File size: 4,552 Bytes
1ee39a9 b1a9f46 bed9d39 b1a9f46 bed9d39 b1a9f46 54ff036 bed9d39 b1a9f46 bed9d39 54ff036 b1a9f46 bed9d39 54ff036 bed9d39 b1a9f46 54ff036 b1a9f46 bed9d39 b1a9f46 54ff036 bed9d39 54ff036 b1a9f46 bed9d39 54ff036 bed9d39 54ff036 bed9d39 b1a9f46 bed9d39 b1a9f46 54ff036 b1a9f46 bed9d39 54ff036 b1a9f46 54ff036 b1a9f46 54ff036 b1a9f46 bed9d39 54ff036 bed9d39 b1a9f46 1ee39a9 bed9d39 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 |
import gradio as gr
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
def load_data(parquet_file: str) -> pd.DataFrame:
    """Read a Parquet file into a pandas DataFrame through DuckDB.

    Args:
        parquet_file: Path of the Parquet file to read.

    Returns:
        A DataFrame holding every row and column of the file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a parameter instead of formatting it into the SQL
        # text, so quotes in the path can neither break nor alter the query.
        # Selecting straight from read_parquet also avoids materialising an
        # intermediate table that would be discarded with the connection.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [str(parquet_file)]
        ).fetchdf()
    finally:
        # Release the in-memory database even if the read fails.
        con.close()
def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
    """Build a cleaned numeric feature matrix and target series.

    Numeric columns other than the target become the features. Infinite
    feature values are treated as missing, and missing values in both the
    features and the target are filled with the column median. Optionally the
    features are reduced to the ``k_best`` highest-scoring columns according
    to ``f_regression``.

    Args:
        df: Input data; it is not modified.
        target_column: Name of the column used as the target.
        k_best: Number of features to keep. ``None``, or a value that is not
            smaller than the number of available features, skips selection.

    Returns:
        A ``(X, y)`` tuple: the feature DataFrame and the target Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in the data")

    y = df[target_column].copy()

    # Keep the target out of the feature matrix: leaving it in would make it
    # its own best predictor in SelectKBest and feed it into the PCA.
    X = df.select_dtypes(include=[float, int]).drop(
        columns=[target_column], errors="ignore"
    )
    X = X.replace([np.inf, -np.inf], np.nan)
    # A column with no finite value has no median, so it would stay NaN after
    # imputation and break the downstream scaler/PCA; drop it instead.
    X = X.dropna(axis=1, how="all")
    X = X.fillna(X.median())
    y = y.fillna(y.median())

    if k_best is not None and k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=k_best)
        selector.fit(X, y)
        # Index by position so the result stays a DataFrame with column names.
        X = X.iloc[:, selector.get_support(indices=True)]
    return X, y
def apply_pca(X: pd.DataFrame, n_components: int = 5):
    """Standardize the features and fit a PCA on them.

    Args:
        X: Numeric feature matrix.
        n_components: Number of principal components to keep. An integer
            larger than ``min(n_samples, n_features)`` is reduced to that
            limit. Non-integer values are passed to ``PCA`` unchanged.

    Returns:
        A ``(X_pca, pca)`` tuple: the transformed data and the fitted PCA
        model.
    """
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # UI widgets may deliver whole numbers as floats; PCA expects an int for
    # a component count.
    if isinstance(n_components, float) and n_components.is_integer():
        n_components = int(n_components)
    # PCA raises when asked for more components than min(n_samples,
    # n_features), which happens easily after feature selection; cap the
    # request so the caller still gets the largest valid decomposition.
    if isinstance(n_components, (int, np.integer)):
        n_components = min(int(n_components), *X_scaled.shape)

    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca
def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Median Income Household'):
    """Render three PCA diagnostic plots and save them as PNG files.

    The plots are a scatter of the first two components coloured by ``y``,
    a seaborn pair plot of up to the first five components with ``y`` as hue,
    and a scree plot of the explained variance ratio of ``pca``.

    Args:
        X_pca: Transformed data; columns 0 and 1 are used for the scatter.
        y: Target values used for colouring.
        pca: Fitted PCA model providing ``n_components_`` and
            ``explained_variance_ratio_``.
        target_label: Label for the colour bar and name of the hue column.

    Returns:
        The file names of the scatter, pair and scree plots, in that order.
    """
    # --- Scatter of the first two components ---
    scatter_path = 'pca_scatter.png'
    scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
    points = scatter_ax.scatter(
        X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50
    )
    scatter_ax.set_xlabel('Principal Component 1')
    scatter_ax.set_ylabel('Principal Component 2')
    scatter_ax.set_title('PCA - First Two Principal Components')
    scatter_fig.colorbar(points, ax=scatter_ax).set_label(target_label)
    scatter_fig.savefig(scatter_path, dpi=100, bbox_inches='tight')
    plt.close(scatter_fig)

    # --- Pair plot of at most the first five components ---
    pair_path = 'pca_pairplot.png'
    shown = min(X_pca.shape[1], 5)
    pc_names = [f'PC{idx}' for idx in range(1, shown + 1)]
    components = pd.DataFrame(X_pca[:, :shown], columns=pc_names)
    components[target_label] = y.values
    sns.pairplot(components, vars=pc_names, hue=target_label, palette='viridis')
    # pairplot leaves its own figure current, so the pyplot calls act on it.
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig(pair_path, dpi=100, bbox_inches='tight')
    plt.close()

    # --- Scree plot of explained variance per component ---
    scree_path = 'pca_scree.png'
    component_ids = range(1, pca.n_components_ + 1)
    scree_fig, scree_ax = plt.subplots(figsize=(8, 5))
    scree_ax.bar(component_ids, pca.explained_variance_ratio_, alpha=0.7, color='red')
    scree_ax.set_xlabel('Principal Components')
    scree_ax.set_ylabel('Variance Explained')
    scree_ax.set_title('Scree Plot')
    scree_ax.set_xticks(component_ids)
    scree_fig.savefig(scree_path, dpi=100, bbox_inches='tight')
    plt.close(scree_fig)

    return scatter_path, pair_path, scree_path
def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
    """Run the load -> preprocess -> PCA -> plot pipeline for the UI.

    Args:
        target_column: Name of the target column in the dataset.
        n_components: Number of principal components to compute.
        k_best: Number of features to keep with SelectKBest; a value that is
            not greater than 0 disables feature selection.

    Returns:
        The scatter, pair and scree plot file names produced by
        ``visualize_pca``.
    """
    frame = load_data('df_usa_health_features.parquet')

    # The UI uses 0 to mean "no selection"; preprocess_data expects None.
    feature_limit = None
    if k_best > 0:
        feature_limit = k_best

    features, target = preprocess_data(frame, target_column, feature_limit)
    projected, fitted_pca = apply_pca(features, n_components)
    return visualize_pca(projected, target, fitted_pca, target_label=target_column)
# ------------------------------------------------------------------------------
# Gradio interface, built from top-level components (gr.Textbox, gr.Slider,
# gr.Image) rather than the legacy `gr.inputs` / `gr.outputs` namespaces.
# ------------------------------------------------------------------------------
# Web UI: three inputs mapped positionally onto gradio_interface's parameters,
# three image outputs mapped onto the plot file names it returns.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Target Column", value="Median_Income_Household"),
        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0),
    ],
    outputs=[
        # "filepath" is the Image type for path strings; "file" is not an
        # accepted value in current Gradio releases.
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
        gr.Image(type="filepath", label="Scree Plot"),
    ],
    title="PCA Visualization with DuckDB and Gradio",
    description=(
        "Load data from a Parquet file, optionally perform feature selection, "
        "run PCA, and visualize the results.\n"
        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
        "2) Choose the number of PCA components.\n"
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    ),
)

# Start the web server only when executed as a script, not on import.
if __name__ == "__main__":
    iface.launch()
|