baho / app.py
LeonceNsh's picture
Update app.py
b1a9f46 verified
raw
history blame
3.03 kB
import gradio as gr
import pandas as pd
import duckdb
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
def load_data(parquet_file):
    """Read a Parquet file into a pandas DataFrame through in-memory DuckDB.

    Args:
        parquet_file: Path of the Parquet file to read.

    Returns:
        A pandas DataFrame holding every row and column of the file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter rather than formatting it into
        # the SQL text, so a quote character in the path cannot break or
        # alter the statement.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [str(parquet_file)]
        ).fetchdf()
    finally:
        # Release the connection even if the read fails.
        con.close()
def preprocess_and_pca(df, target_column, n_components=5):
    """Impute and standardize the numeric columns of *df*, then apply PCA.

    Args:
        df: Input pandas DataFrame.
        target_column: Name of the column returned alongside the projection.
        n_components: Number of principal components to keep.

    Returns:
        A tuple ``(X_pca, y)``: the PCA-transformed feature array and the
        target column with infinities and missing values replaced by its
        median.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    # Keep only the float/int columns as features.
    # NOTE(review): a numeric target column remains part of X here, so it
    # also feeds the PCA -- confirm this is intended.
    X = df.select_dtypes(include=[float, int])
    y = df[target_column]

    # Treat infinities as missing. Rebinding (instead of inplace=True) avoids
    # mutating a frame derived from the caller's DataFrame.
    X = X.replace([np.inf, -np.inf], np.nan)
    y = y.replace([np.inf, -np.inf], np.nan)

    # A column with no finite value has a NaN median, so it would survive the
    # imputation below and make PCA reject the input; drop such columns.
    X = X.dropna(axis=1, how='all')

    # Impute the remaining gaps with each column's median.
    X = X.fillna(X.median())
    y = y.fillna(y.median())

    # Standardize so every feature contributes on the same scale.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Project onto the leading principal components.
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, y
def visualize_pca(X_pca, y):
    """Save a scatter plot and a pair plot of PCA components colored by *y*.

    Args:
        X_pca: 2-D array of principal components, one column per component.
            The scatter plot reads columns 0 and 1, so at least two columns
            are required.
        y: Target values, one per row of ``X_pca``, used for coloring.

    Returns:
        A tuple ``('pca_scatter.png', 'pca_pairplot.png')`` with the paths of
        the two image files written to the working directory.
    """
    # Scatter of the first two principal components.
    fig = plt.figure(figsize=(10, 6))
    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA - First Two Principal Components')
    plt.colorbar(label='Median Income Household')
    plt.savefig('pca_scatter.png')
    # Close this exact figure rather than whichever one happens to be current.
    plt.close(fig)

    # One named column per principal component.
    component_names = [f'PC{i+1}' for i in range(X_pca.shape[1])]
    pca_df = pd.DataFrame(X_pca, columns=component_names)
    # Assign by position: a Series would otherwise be aligned on its index
    # labels, which need not match pca_df's fresh 0..n-1 index.
    pca_df['Median_Income_Household'] = np.asarray(y)

    # Pair plot of at most the first five components; slicing the actual
    # column list avoids a KeyError when fewer than five are available.
    sns.pairplot(pca_df, vars=component_names[:5], hue='Median_Income_Household', palette='viridis')
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig('pca_pairplot.png')
    plt.close()
    return 'pca_scatter.png', 'pca_pairplot.png'
def gradio_interface(target_column):
    """Run the load -> PCA -> plot pipeline for one target column.

    Args:
        target_column: Name of the column passed to ``preprocess_and_pca``
            as the target.

    Returns:
        The two values produced by ``visualize_pca``, in order: the scatter
        plot and the pair plot.
    """
    frame = load_data('df_usa_health_features.parquet')
    components, target = preprocess_and_pca(frame, target_column)
    scatter_path, pairplot_path = visualize_pca(components, target)
    return scatter_path, pairplot_path
# Build the Gradio UI: a single text input naming the target column, and two
# image outputs fed by the values that gradio_interface returns.
# NOTE(review): gr.inputs / gr.outputs look like the legacy Gradio component
# namespaces -- confirm the installed Gradio version still provides them.
iface = gr.Interface(
    title="PCA Visualization with DuckDB and Gradio",
    description="Specify the target column to visualize PCA components from the df_usa_health_features.parquet file.",
    fn=gradio_interface,
    inputs=[gr.inputs.Textbox(label="Target Column")],
    outputs=[
        gr.outputs.Image(type="file", label="PCA Scatter Plot"),
        gr.outputs.Image(type="file", label="PCA Pair Plot"),
    ],
)

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()