|
import gradio as gr |
|
import pandas as pd |
|
import duckdb |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
from sklearn.decomposition import PCA |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.feature_selection import SelectKBest, f_regression |
|
|
|
def load_data(parquet_file: str) -> pd.DataFrame:
    """Load a Parquet file into a pandas DataFrame through DuckDB.

    Args:
        parquet_file: Path of the Parquet file to read.

    Returns:
        A DataFrame holding every row and column of the file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter instead of formatting it into
        # the SQL text, so a quote character in the path cannot break or
        # alter the statement. Selecting straight from the file also avoids
        # materialising an intermediate table that is only read once.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [parquet_file]
        ).fetchdf()
    finally:
        # Release the in-memory database even when the read fails.
        con.close()
|
|
|
def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
    """Build a cleaned numeric feature matrix and target vector from *df*.

    Features are the float/int columns of *df* except the target itself.
    Infinite values are treated as missing, and missing values are replaced
    by the column median. Columns with no finite value at all are dropped
    because they have no median to impute with. The input frame is not
    modified.

    Args:
        df: Source data.
        target_column: Name of the column used as the regression target.
        k_best: If given and smaller than the number of features, keep only
            the ``k_best`` features ranked highest by ``f_regression``.
            ``None`` skips feature selection.

    Returns:
        A ``(X, y)`` tuple: the feature DataFrame and the target Series.

    Raises:
        KeyError: If *target_column* is not a column of *df*.
        ValueError: If no usable numeric feature column remains.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in the data")

    # Clean the target the same way as the features: +/-inf count as missing.
    y = df[target_column].replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())

    # The target must not be one of its own predictors; leaving it in would
    # leak it into both the feature ranking and the PCA projection.
    X = df.select_dtypes(include=[float, int]).drop(
        columns=[target_column], errors="ignore"
    )
    X = X.replace([np.inf, -np.inf], np.nan)
    # A column without any finite value has a NaN median, so imputation
    # would leave NaNs behind for the downstream estimators to reject.
    X = X.dropna(axis=1, how="all")
    if X.shape[1] == 0:
        raise ValueError("No usable numeric feature columns found in the data")
    X = X.fillna(X.median())

    if k_best is not None and k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=k_best)
        selector.fit(X, y)
        # Index the frame rather than using the transformed array so the
        # column names are preserved.
        X = X.iloc[:, selector.get_support(indices=True)]

    return X, y
|
|
|
def apply_pca(X: pd.DataFrame, n_components: int = 5):
    """Standardise *X* and project it onto its principal components.

    Args:
        X: Two-dimensional feature matrix (samples x features).
        n_components: Number of components to keep. An explicit count
            (a number >= 1) is converted to ``int`` and capped at
            ``min(n_samples, n_features)``. ``None``, strings such as
            ``'mle'`` and values below 1 are handed to ``PCA`` unchanged.

    Returns:
        A ``(X_pca, pca)`` tuple: the projected data and the fitted PCA
        model.
    """
    is_explicit_count = (
        n_components is not None
        and not isinstance(n_components, str)
        and n_components >= 1
    )
    if is_explicit_count:
        # PCA cannot extract more components than min(n_samples,
        # n_features) and raises if asked to; it also rejects integer-valued
        # floats such as 5.0, which UI widgets may deliver.
        n_components = min(int(n_components), *X.shape)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca
|
|
|
def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str):
    """Render PCA diagnostics to PNG files in the working directory.

    Three images are written: a scatter of the first two components
    coloured by the target, a pair plot of up to the first five components,
    and a scree plot of the explained-variance ratios.

    Args:
        X_pca: Projected data with one column per principal component.
        y: Target values, one per row of *X_pca*.
        pca: Fitted PCA model; ``n_components_`` and
            ``explained_variance_ratio_`` are read from it.
        target_label: Label used for the colour bar and the hue column.

    Returns:
        A ``(scatter_plot_file, pair_plot_file, scree_plot_file)`` tuple of
        the file names written.

    Raises:
        ValueError: If *X_pca* has fewer than two components.
    """
    # Fail before opening any figure: the scatter plot needs two axes.
    if X_pca.shape[1] < 2:
        raise ValueError(
            "At least two principal components are required for plotting"
        )

    # Accept any array-like target, not only a pandas Series.
    target_values = np.asarray(y)

    # --- Scatter of the first two components ------------------------------
    scatter_plot_file = 'pca_scatter.png'
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        points = ax.scatter(X_pca[:, 0], X_pca[:, 1], c=target_values,
                            cmap='viridis', edgecolor='k', s=50)
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')
        ax.set_title('PCA - First Two Principal Components')
        cbar = fig.colorbar(points, ax=ax)
        cbar.set_label(target_label)
        fig.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
    finally:
        # Close even on failure so figures do not pile up across calls.
        plt.close(fig)

    # --- Pair plot of the leading components ------------------------------
    pair_plot_file = 'pca_pairplot.png'
    num_components = min(X_pca.shape[1], 5)
    pc_names = [f'PC{i+1}' for i in range(num_components)]
    pca_df = pd.DataFrame(X_pca[:, :num_components], columns=pc_names)
    pca_df[target_label] = target_values
    try:
        # NOTE(review): hue is the raw target; for a continuous target
        # seaborn may treat every distinct value as its own hue level, which
        # can be slow on large data - consider binning. TODO confirm.
        sns.pairplot(pca_df, vars=pc_names, hue=target_label, palette='viridis')
        # pairplot creates its own figure, which is now the current one.
        plt.suptitle('Pair Plot of Principal Components', y=1.02)
        plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
    finally:
        plt.close()

    # --- Scree plot -------------------------------------------------------
    scree_plot_file = 'pca_scree.png'
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        component_numbers = range(1, pca.n_components_ + 1)
        ax.bar(component_numbers, pca.explained_variance_ratio_,
               alpha=0.7, color='red')
        ax.set_xlabel('Principal Components')
        ax.set_ylabel('Variance Explained')
        ax.set_title('Scree Plot')
        ax.set_xticks(component_numbers)
        fig.savefig(scree_plot_file, dpi=100, bbox_inches='tight')
    finally:
        plt.close(fig)

    return scatter_plot_file, pair_plot_file, scree_plot_file
|
|
|
def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
    """Run the load / preprocess / PCA / plot pipeline for the Gradio UI.

    Args:
        target_column: Name of the target column in the Parquet data.
        n_components: Number of principal components to compute.
        k_best: Number of features to keep with SelectKBest; any value
            <= 0 skips feature selection.

    Returns:
        A ``(scatter_plot, pair_plot, scree_plot)`` tuple of image file
        paths, in the order returned by ``visualize_pca``.
    """
    # Slider widgets are documented to pass their value as a float, while
    # the scikit-learn estimators require integer counts; coerce once at
    # the UI boundary.
    n_components = int(n_components)
    k_best = int(k_best)

    df = load_data('df_usa_health_features.parquet')
    X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
    X_pca, pca_model = apply_pca(X, n_components)
    scatter_plot, pair_plot, scree_plot = visualize_pca(
        X_pca, y, pca_model, target_label=target_column
    )
    return scatter_plot, pair_plot, scree_plot
|
|
|
# Input widgets; their order matches the positional parameters of
# gradio_interface (target column, PCA component count, SelectKBest k).
_input_widgets = [
    gr.Textbox(label="Target Column", value="Median_Income_Household"),
    gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
    gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0),
]

# One image widget per file path returned by gradio_interface, same order.
_output_widgets = [
    gr.Image(type="filepath", label="PCA Scatter Plot"),
    gr.Image(type="filepath", label="PCA Pair Plot"),
    gr.Image(type="filepath", label="Scree Plot"),
]

# Help text passed to the interface as its description.
_description = (
    "Load data from a Parquet file, optionally perform feature selection, "
    "run PCA, and visualize the results.\n"
    "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
    "2) Choose the number of PCA components.\n"
    "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
)

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_input_widgets,
    outputs=_output_widgets,
    title="PCA Visualization with DuckDB and Gradio",
    description=_description,
)


if __name__ == "__main__":
    # Launch the interface only when executed as a script.
    iface.launch()
|
|