"""Gradio app: load a Parquet dataset with DuckDB, run PCA, and plot the results."""

import os
import tempfile
from typing import Optional, Tuple

import matplotlib

# Figures are only ever written to PNG files (never shown), so select the
# non-interactive backend before pyplot is imported; no GUI toolkit is needed.
matplotlib.use("Agg")

import duckdb
import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import StandardScaler

# Parquet file read by the Gradio callback.
DATA_FILE = 'df_usa_health_features.parquet'

# A target with more distinct values than this is quantile-binned before being
# used as the pair-plot hue (otherwise every distinct value is its own group).
_MAX_HUE_LEVELS = 10
_HUE_BINS = 4


def load_data(parquet_file: str) -> pd.DataFrame:
    """Read *parquet_file* through an in-memory DuckDB connection.

    Args:
        parquet_file: Path of the Parquet file to read.

    Returns:
        The full contents of the file as a DataFrame.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a parameter instead of formatting it into the SQL
        # text, so quotes in the path cannot break or alter the query.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [parquet_file]
        ).fetchdf()
    finally:
        con.close()


def preprocess_data(
    df: pd.DataFrame,
    target_column: str,
    k_best: Optional[int] = None,
) -> Tuple[pd.DataFrame, pd.Series]:
    """Build an imputed numeric feature matrix and target vector from *df*.

    Features are all numeric columns except the target itself. Infinite
    values are treated as missing, columns with no finite value are dropped,
    and remaining gaps are filled with the column median. The target is
    imputed the same way.

    Args:
        df: Input data.
        target_column: Name of the numeric column to use as the target.
        k_best: When set to a positive number smaller than the feature count,
            keep only the ``k_best`` features ranked highest by
            ``f_regression``. ``None`` or a non-positive value skips selection.

    Returns:
        ``(X, y)``: the feature DataFrame and the target Series.

    Raises:
        KeyError: If *target_column* is not a column of *df*.
        ValueError: If the target is not numeric, has no finite values, or no
            usable numeric feature column remains.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in the data")

    y = df[target_column].copy()
    if not pd.api.types.is_numeric_dtype(y):
        raise ValueError(f"Target column {target_column!r} must be numeric")

    # Exclude the target from the features: leaving it in leaks the target
    # into both the feature ranking and the principal components.
    X = df.select_dtypes(include="number").drop(
        columns=[target_column], errors="ignore"
    )

    X = X.replace([np.inf, -np.inf], np.nan)
    # A column with no finite value has a NaN median and would stay NaN
    # after imputation, so remove it up front.
    X = X.dropna(axis="columns", how="all")
    X = X.fillna(X.median())

    y = y.replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())

    if X.shape[1] == 0:
        raise ValueError("No usable numeric feature columns were found")
    if y.isna().any():
        raise ValueError(
            f"Target column {target_column!r} contains no finite values"
        )

    if k_best is not None and 0 < k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=int(k_best))
        selector.fit(X, y)
        X = X.iloc[:, selector.get_support(indices=True)]

    return X, y


def apply_pca(X: pd.DataFrame, n_components: int = 5) -> Tuple[np.ndarray, PCA]:
    """Standardize *X* and project it onto its principal components.

    Args:
        X: Feature matrix without missing values.
        n_components: Requested number of components. A numeric value >= 1 is
            truncated to an integer and capped at ``min(n_samples,
            n_features)``; any other value is passed to ``PCA`` unchanged.

    Returns:
        ``(X_pca, pca)``: the projected data and the fitted ``PCA`` model.

    Raises:
        ValueError: If *X* has no rows or no columns.
    """
    n_samples, n_features = X.shape
    max_components = min(n_samples, n_features)
    if max_components < 1:
        raise ValueError("PCA needs at least one sample and one feature")

    is_count = (
        isinstance(n_components, (int, float, np.integer, np.floating))
        and n_components >= 1
    )
    if is_count:
        # Cap the request so PCA does not reject a count above the data's rank
        # limit (e.g. after feature selection left only a few columns).
        n_components = min(int(n_components), max_components)

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca


def _bin_for_hue(values: np.ndarray) -> pd.Series:
    """Return *values* as-is, or as quantile-bin labels when there are many distinct values."""
    series = pd.Series(values)
    if series.nunique() <= _MAX_HUE_LEVELS:
        return series
    binned = pd.qcut(series, q=_HUE_BINS, duplicates="drop")
    # Use the interval text as the category label so the legend shows ranges.
    return binned.cat.rename_categories(lambda interval: str(interval))


def _save_scatter(
    X_pca: np.ndarray, target_values: np.ndarray, target_label: str, path: str
) -> None:
    """Save a scatter plot of the first two components, colored by the target."""
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        points = ax.scatter(
            X_pca[:, 0], X_pca[:, 1],
            c=target_values, cmap='viridis', edgecolor='k', s=50,
        )
        ax.set_xlabel('Principal Component 1')
        ax.set_ylabel('Principal Component 2')
        ax.set_title('PCA - First Two Principal Components')
        cbar = fig.colorbar(points, ax=ax)
        cbar.set_label(target_label)
        fig.savefig(path, dpi=100, bbox_inches='tight')
    finally:
        plt.close(fig)


def _save_pairplot(
    X_pca: np.ndarray, target_values: np.ndarray, target_label: str, path: str
) -> None:
    """Save a pair plot of up to the first five components, grouped by the target."""
    num_components = min(X_pca.shape[1], 5)
    component_names = [f'PC{i+1}' for i in range(num_components)]
    pca_df = pd.DataFrame(X_pca[:, :num_components], columns=component_names)
    pca_df[target_label] = _bin_for_hue(target_values)

    grid = sns.pairplot(
        pca_df, vars=component_names, hue=target_label, palette='viridis'
    )
    try:
        grid.fig.suptitle('Pair Plot of Principal Components', y=1.02)
        grid.fig.savefig(path, dpi=100, bbox_inches='tight')
    finally:
        plt.close(grid.fig)


def _save_scree(pca: PCA, path: str) -> None:
    """Save a bar chart of the explained-variance ratio of each component."""
    component_numbers = range(1, pca.n_components_ + 1)
    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        ax.bar(
            component_numbers, pca.explained_variance_ratio_,
            alpha=0.7, color='red',
        )
        ax.set_xlabel('Principal Components')
        ax.set_ylabel('Variance Explained')
        ax.set_title('Scree Plot')
        ax.set_xticks(list(component_numbers))
        fig.savefig(path, dpi=100, bbox_inches='tight')
    finally:
        plt.close(fig)


def visualize_pca(
    X_pca: np.ndarray,
    y: pd.Series,
    pca,
    target_label: str = 'Median Income Household',
    output_dir: Optional[str] = None,
) -> Tuple[str, str, str]:
    """Write scatter, pair, and scree plots for a fitted PCA to PNG files.

    Args:
        X_pca: Projected data with one column per principal component.
        y: Target values, one per row of *X_pca*; used to color the plots.
        pca: Fitted PCA model (``n_components_`` and
            ``explained_variance_ratio_`` are read).
        target_label: Label used for the colorbar and the pair-plot legend.
        output_dir: Directory for the PNG files. ``None`` writes them to the
            current working directory under their bare file names.

    Returns:
        Paths of the scatter plot, pair plot, and scree plot, in that order.

    Raises:
        ValueError: If *X_pca* has fewer than two components.
    """
    if X_pca.shape[1] < 2:
        raise ValueError(
            "At least two principal components are required for plotting"
        )

    directory = output_dir or ""
    scatter_plot_file = os.path.join(directory, 'pca_scatter.png')
    pair_plot_file = os.path.join(directory, 'pca_pairplot.png')
    scree_plot_file = os.path.join(directory, 'pca_scree.png')

    # Positional values only: the PCA rows carry no index to align against.
    target_values = np.asarray(y)

    _save_scatter(X_pca, target_values, target_label, scatter_plot_file)
    _save_pairplot(X_pca, target_values, target_label, pair_plot_file)
    _save_scree(pca, scree_plot_file)

    return scatter_plot_file, pair_plot_file, scree_plot_file


def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
    """Gradio callback: load the data, run the PCA pipeline, return plot paths.

    Args:
        target_column: Name of the target column in the dataset.
        n_components: Number of principal components to compute.
        k_best: Number of features to keep with SelectKBest; 0 skips selection.

    Returns:
        Paths of the scatter plot, pair plot, and scree plot, in that order.

    Raises:
        gr.Error: If the inputs cannot be processed (e.g. unknown or
            non-numeric target column).
    """
    # Coerce the slider values, which may not arrive as Python ints.
    n_components = int(n_components)
    k_best = int(k_best)
    target_column = target_column.strip()

    try:
        df = load_data(DATA_FILE)
        X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
        X_pca, pca_model = apply_pca(X, n_components)
        # One directory per request so concurrent requests cannot overwrite
        # each other's images. NOTE: these directories are not removed here.
        output_dir = tempfile.mkdtemp(prefix="pca_plots_")
        scatter_plot, pair_plot, scree_plot = visualize_pca(
            X_pca, y, pca_model,
            target_label=target_column, output_dir=output_dir,
        )
    except (KeyError, ValueError) as exc:
        message = exc.args[0] if exc.args else str(exc)
        raise gr.Error(str(message)) from exc

    return scatter_plot, pair_plot, scree_plot


# ------------------------------------------------------------------------------
# Gradio interface built from direct component calls (no gr.inputs / gr.outputs).
# ------------------------------------------------------------------------------
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Target Column", value="Median_Income_Household"),
        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0),
    ],
    outputs=[
        # The callback returns file paths, which is the "filepath" image type.
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
        gr.Image(type="filepath", label="Scree Plot"),
    ],
    title="PCA Visualization with DuckDB and Gradio",
    description=(
        "Load data from a Parquet file, optionally perform feature selection, "
        "run PCA, and visualize the results.\n"
        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
        "2) Choose the number of PCA components.\n"
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    ),
)


if __name__ == "__main__":
    iface.launch()