import warnings

import duckdb
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler
def load_data(parquet_file: str) -> pd.DataFrame:
    """Read a Parquet file into a pandas DataFrame using in-memory DuckDB.

    Args:
        parquet_file: Path (or glob) handed to DuckDB's ``read_parquet``.

    Returns:
        All rows and columns of the file as a DataFrame.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a parameter instead of formatting it into the SQL
        # text: a quote character in the path can then neither break the
        # statement nor inject SQL.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [parquet_file]
        ).fetchdf()
    finally:
        # Release the connection even when the read fails.
        con.close()
def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
    """Build a cleaned feature matrix and target vector from ``df``.

    Features are the columns matched by ``select_dtypes(include=[float, int])``
    with the target column removed. Infinite values are treated as missing,
    and missing values are filled with the per-column median (features) or
    the target's median (target). ``df`` itself is not modified.

    Args:
        df: Source data.
        target_column: Name of the column to use as the target ``y``.
        k_best: If given and smaller than the number of feature columns,
            keep only the ``k_best`` columns ranked highest by
            ``SelectKBest(f_regression)``. ``None`` skips selection.

    Returns:
        ``(X, y)``: the feature DataFrame and the target Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    y = df[target_column].copy()
    X = df.select_dtypes(include=[float, int]).copy()

    # The target must not be one of its own predictors: left in, it leaks
    # into PCA and is trivially chosen by SelectKBest.
    X = X.drop(columns=[target_column], errors="ignore")

    # Treat +/-inf as missing, then impute with medians.
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())
    y = y.replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())

    if k_best is not None and k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=k_best)
        selector.fit(X, y)
        # Index by position so X stays a DataFrame with its column names.
        X = X.iloc[:, selector.get_support(indices=True)]
    return X, y
def apply_pca(X: pd.DataFrame, n_components: int = 5):
    """Standardize ``X`` and project it onto its principal components.

    Args:
        X: Numeric feature matrix (rows are samples, columns are features).
        n_components: Number of components to keep. An integer request
            larger than ``min(n_samples, n_features)`` is reduced to that
            limit (with a ``UserWarning``) instead of letting PCA raise.
            Non-integer values are passed to ``PCA`` unchanged.

    Returns:
        ``(X_pca, pca)``: the transformed array and the fitted ``PCA`` model.
    """
    X_scaled = StandardScaler().fit_transform(X)

    # UI widgets may hand over integer-valued floats such as 5.0.
    if isinstance(n_components, float) and n_components.is_integer():
        n_components = int(n_components)

    # PCA cannot produce more components than the smaller data dimension.
    max_components = min(X_scaled.shape)
    if isinstance(n_components, (int, np.integer)) and n_components > max_components:
        warnings.warn(
            f"n_components={n_components} exceeds the maximum of "
            f"{max_components} for this data; using {max_components}.",
            stacklevel=2,
        )
        n_components = max_components

    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca
def _save_component_scatter(X_pca, y, target_label):
    """Save the PC1-vs-PC2 scatter, coloured by ``y``, and return its path."""
    out_path = 'pca_scatter.png'
    plt.figure(figsize=(10, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA - First Two Principal Components')
    plt.colorbar().set_label(target_label)
    plt.savefig(out_path, dpi=100, bbox_inches='tight')
    plt.close()
    return out_path


def _save_component_pairplot(X_pca, y, target_label):
    """Save a seaborn pair plot of up to five components and return its path."""
    out_path = 'pca_pairplot.png'
    shown = min(X_pca.shape[1], 5)
    pc_names = [f'PC{idx}' for idx in range(1, shown + 1)]
    frame = pd.DataFrame(X_pca[:, :shown], columns=pc_names)
    frame[target_label] = y.values
    # NOTE(review): hue receives the raw target values; for a continuous
    # target this may produce one hue level per distinct value -- confirm
    # that this is intended.
    sns.pairplot(frame, vars=pc_names, hue=target_label, palette='viridis')
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig(out_path, dpi=100, bbox_inches='tight')
    plt.close()
    return out_path


def _save_scree_plot(pca):
    """Save a bar chart of explained-variance ratios and return its path."""
    out_path = 'pca_scree.png'
    component_numbers = range(1, pca.n_components_ + 1)
    plt.figure(figsize=(8, 5))
    plt.bar(component_numbers, pca.explained_variance_ratio_, alpha=0.7, color='red')
    plt.xlabel('Principal Components')
    plt.ylabel('Variance Explained')
    plt.title('Scree Plot')
    plt.xticks(component_numbers)
    plt.savefig(out_path, dpi=100, bbox_inches='tight')
    plt.close()
    return out_path


def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str):
    """Render three PCA diagnostic plots to PNG files.

    The plots are written, in this order, to fixed relative file names:
    ``pca_scatter.png`` (first two components coloured by ``y``),
    ``pca_pairplot.png`` (pair plot of at most five components) and
    ``pca_scree.png`` (explained-variance ratio per component).

    Args:
        X_pca: Transformed data; columns are principal components. The
            scatter plot reads columns 0 and 1.
        y: Target values, used for colouring.
        pca: Fitted PCA model providing ``n_components_`` and
            ``explained_variance_ratio_``.
        target_label: Label for the colour bar and the pair plot's hue column.

    Returns:
        Tuple ``(scatter_path, pairplot_path, scree_path)`` of file names.
    """
    scatter_path = _save_component_scatter(X_pca, y, target_label)
    pairplot_path = _save_component_pairplot(X_pca, y, target_label)
    scree_path = _save_scree_plot(pca)
    return scatter_path, pairplot_path, scree_path
def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
    """Gradio callback: load the data, run PCA and return the plot files.

    Args:
        target_column: Column of the dataset used as the target.
        n_components: Number of principal components to compute.
        k_best: Number of features to keep with SelectKBest; ``0`` (or any
            non-positive value) skips feature selection.

    Returns:
        Tuple of three image paths: scatter plot, pair plot, scree plot.

    Raises:
        gr.Error: If ``target_column`` is not a column of the dataset, so
            the UI shows a readable message rather than a bare ``KeyError``.
    """
    # Coerce defensively: numeric UI widgets may deliver values such as 5.0,
    # while the downstream estimators expect integer counts.
    n_components = int(n_components)
    k_best = int(k_best)

    df = load_data('df_usa_health_features.parquet')
    if target_column not in df.columns:
        raise gr.Error(
            f"Target column '{target_column}' was not found in the dataset."
        )

    X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
    X_pca, pca_model = apply_pca(X, n_components)
    return visualize_pca(X_pca, y, pca_model, target_label=target_column)
# Web UI wiring: three inputs (target column, PCA component count, optional
# SelectKBest k) are passed to gradio_interface, which returns three image
# file paths shown as the outputs.
iface = gr.Interface(
    fn=gradio_interface,
    title="PCA Visualization with DuckDB and Gradio",
    description=(
        "Load data from a Parquet file, optionally perform feature selection, "
        "run PCA, and visualize the results.\n"
        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
        "2) Choose the number of PCA components.\n"
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    ),
    inputs=[
        gr.Textbox(label="Target Column", value="Median_Income_Household"),
        gr.Slider(
            label="Number of PCA Components",
            minimum=2,
            maximum=20,
            step=1,
            value=5,
        ),
        gr.Slider(
            label="SelectKBest (0 to skip)",
            minimum=0,
            maximum=50,
            step=1,
            value=0,
        ),
    ],
    outputs=[
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
        gr.Image(type="filepath", label="Scree Plot"),
    ],
)


if __name__ == "__main__":
    # Start the app only when executed as a script, not on import.
    iface.launch()