|
import gradio as gr |
|
import pandas as pd |
|
import duckdb |
|
from sklearn.decomposition import PCA |
|
from sklearn.preprocessing import StandardScaler |
|
from sklearn.feature_selection import SelectKBest, f_regression |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import seaborn as sns |
|
|
|
|
|
def load_data(parquet_file):
    """Load a Parquet file into a pandas DataFrame via an in-memory DuckDB.

    Args:
        parquet_file: Path of the Parquet file to read.

    Returns:
        A pandas DataFrame with every row and column of the file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter instead of splicing it into the
        # SQL text, so a path containing a quote cannot break or alter the
        # statement. Selecting straight from read_parquet also avoids
        # materialising a temporary table that was only read back once.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [str(parquet_file)]
        ).fetchdf()
    finally:
        # The connection is private to this call; release it even on failure.
        con.close()
|
|
|
|
|
def preprocess_and_pca(df, target_column, n_components=5):
    """Clean the numeric features of ``df`` and project them with PCA.

    The target column is kept out of the feature matrix so the principal
    components describe the predictors only. Infinite values are treated as
    missing, missing values are replaced by the column median, and the
    features are standardised before PCA is fitted.

    Args:
        df: Input pandas DataFrame.
        target_column: Name of the column returned as the target. Its median
            is computed, so it is expected to be numeric.
        n_components: Number of principal components to keep.

    Returns:
        A tuple ``(X_pca, y)``: the array returned by ``PCA.fit_transform``
        and the cleaned target Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    y = df[target_column]

    # select_dtypes would otherwise pull a numeric target into X, leaking the
    # value being visualised into the components themselves.
    X = df.select_dtypes(include=[float, int]).drop(
        columns=[target_column], errors="ignore"
    )

    # Non in-place replace: works on a fresh object rather than mutating a
    # frame derived from the caller's data.
    X = X.replace([np.inf, -np.inf], np.nan)
    # A column without a single finite value has a NaN median, so it would
    # still hold NaN after imputation; drop it before fitting.
    X = X.dropna(axis=1, how="all")
    X = X.fillna(X.median())

    # Give the target the same treatment as the features.
    y = y.replace([np.inf, -np.inf], np.nan)
    y = y.fillna(y.median())

    X_scaled = StandardScaler().fit_transform(X)
    X_pca = PCA(n_components=n_components).fit_transform(X_scaled)

    return X_pca, y
|
|
|
|
|
def visualize_pca(X_pca, y):
    """Render the PCA projection as a scatter plot and a pair plot.

    Args:
        X_pca: 2-D array of principal-component scores, one column per
            component. The scatter plot reads the first two columns.
        y: Target values, one per row of ``X_pca``, used to colour the points.

    Returns:
        The tuple ``('pca_scatter.png', 'pca_pairplot.png')`` naming the two
        image files written by this call (relative paths).
    """
    # Use positional values only: assigning a Series to a DataFrame column
    # aligns on the index, which would silently produce NaN targets whenever
    # y does not carry the default 0..n-1 index.
    target = np.asarray(y)

    # --- Scatter of the first two components -------------------------------
    fig, ax = plt.subplots(figsize=(10, 6))
    points = ax.scatter(
        X_pca[:, 0], X_pca[:, 1],
        c=target, cmap='viridis', edgecolor='k', s=50,
    )
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_title('PCA - First Two Principal Components')
    fig.colorbar(points, ax=ax, label='Median Income Household')
    fig.savefig('pca_scatter.png')
    plt.close(fig)

    # --- Pair plot of the leading components -------------------------------
    pc_columns = [f'PC{i+1}' for i in range(X_pca.shape[1])]
    pca_df = pd.DataFrame(X_pca, columns=pc_columns)
    pca_df['Median_Income_Household'] = target

    # Plot at most the first five components, but never ask for columns that
    # do not exist when fewer components were computed.
    sns.pairplot(
        pca_df,
        vars=pc_columns[:5],
        hue='Median_Income_Household',
        palette='viridis',
    )
    # pairplot draws on a new figure of its own, which pyplot treats as the
    # current figure for the calls below.
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig('pca_pairplot.png')
    plt.close()

    return 'pca_scatter.png', 'pca_pairplot.png'
|
|
|
|
|
def gradio_interface(target_column):
    """Run the load -> PCA -> plot pipeline for one target column.

    Args:
        target_column: Name of the column to use as the target.

    Returns:
        A tuple with the scatter-plot path and the pair-plot path, as
        returned by ``visualize_pca``.
    """
    frame = load_data('df_usa_health_features.parquet')
    components, target = preprocess_and_pca(frame, target_column)
    scatter_path, pairplot_path = visualize_pca(components, target)
    return scatter_path, pairplot_path
|
|
|
|
|
# Components come from the top-level ``gr`` namespace: the ``gr.inputs`` and
# ``gr.outputs`` modules were deprecated in Gradio 3 and removed in Gradio 4.
# ``gradio_interface`` returns image file paths, hence ``type="filepath"``.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Target Column"),
    ],
    outputs=[
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
    ],
    title="PCA Visualization with DuckDB and Gradio",
    description="Specify the target column to visualize PCA components from the df_usa_health_features.parquet file.",
)


# Start the web UI only when the file is executed as a script.
if __name__ == "__main__":
    iface.launch()