File size: 4,552 Bytes
1ee39a9
b1a9f46
 
 
 
 
 
bed9d39
 
 
 
 
b1a9f46
bed9d39
 
 
 
b1a9f46
 
 
54ff036
bed9d39
 
b1a9f46
 
 
bed9d39
 
 
 
 
 
 
54ff036
b1a9f46
 
 
 
bed9d39
 
54ff036
bed9d39
b1a9f46
54ff036
b1a9f46
 
 
bed9d39
 
 
b1a9f46
 
54ff036
bed9d39
54ff036
 
 
 
 
 
b1a9f46
bed9d39
 
 
54ff036
bed9d39
 
54ff036
bed9d39
 
 
 
 
b1a9f46
 
bed9d39
b1a9f46
54ff036
b1a9f46
bed9d39
 
 
 
 
54ff036
 
 
b1a9f46
 
 
54ff036
 
 
b1a9f46
 
54ff036
 
 
b1a9f46
 
bed9d39
 
54ff036
 
 
bed9d39
 
b1a9f46
 
1ee39a9
bed9d39
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
import pandas as pd
import duckdb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression

def load_data(parquet_file: str) -> pd.DataFrame:
    """Read a Parquet file into a pandas DataFrame using in-memory DuckDB.

    Args:
        parquet_file: Path passed to DuckDB's ``read_parquet`` table function.

    Returns:
        Every row and column of the Parquet file as a DataFrame.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter instead of formatting it into
        # the SQL text, so a quote character in the path cannot break (or
        # alter) the statement. The file is read directly; no intermediate
        # table copy is needed to produce the DataFrame.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [parquet_file]
        ).fetchdf()
    finally:
        # Release the connection even if the read raises.
        con.close()

def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
    """Build a cleaned numeric feature matrix and target vector from ``df``.

    Features are the float/int columns of ``df`` *excluding* the target
    column. Infinite feature values are treated as missing, and missing
    values in both features and target are filled with the column median.

    Args:
        df: Source data.
        target_column: Name of the column to use as the target ``y``.
        k_best: When given and smaller than the number of features, keep only
            the ``k_best`` features ranked highest by ``f_regression``.

    Returns:
        Tuple ``(X, y)``: the feature DataFrame and the target Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(f"Target column {target_column!r} not found in the data")

    y = df[target_column]
    y = y.fillna(y.median())

    # Keep the target out of the features: leaving it in would let it leak
    # into feature selection (it correlates perfectly with itself) and PCA.
    X = df.select_dtypes(include=[float, int]).drop(
        columns=[target_column], errors="ignore"
    )
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.median())

    if k_best is not None and k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=k_best)
        selector.fit(X, y)
        # Boolean mask over the columns, so names and order are preserved.
        X = X.loc[:, selector.get_support()]
    return X, y

def apply_pca(X: pd.DataFrame, n_components: int = 5):
    """Standardize ``X`` and project it onto its principal components.

    Args:
        X: Feature matrix (rows are samples, columns are features).
        n_components: Number of components to keep. An integer request
            larger than ``min(n_samples, n_features)`` is capped at that
            limit instead of raising; non-integer values are handed to
            ``PCA`` unchanged.

    Returns:
        Tuple ``(X_pca, pca)``: the transformed array and the fitted PCA model.
    """
    if isinstance(n_components, (int, np.integer)):
        # PCA rejects more components than min(n_samples, n_features); this
        # is easy to hit when feature selection has reduced X to a few columns.
        n_components = min(n_components, *X.shape)

    X_scaled = StandardScaler().fit_transform(X)
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
    return X_pca, pca

def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Median Income Household'):
    """Save scatter, pair and scree plots of a PCA result as PNG files.

    Args:
        X_pca: Transformed data; columns are principal components.
        y: Target values, used to colour the scatter plot and as the pair
            plot hue.
        pca: Fitted PCA model; ``n_components_`` and
            ``explained_variance_ratio_`` are read for the scree plot.
        target_label: Label for the colour bar and name of the hue column.

    Returns:
        Tuple of the three file names written, in the order
        (scatter plot, pair plot, scree plot).
    """
    scatter_plot_file = 'pca_scatter.png'
    pair_plot_file = 'pca_pairplot.png'
    scree_plot_file = 'pca_scree.png'
    save_options = {'dpi': 100, 'bbox_inches': 'tight'}

    # Scatter of the first two components, coloured by the target.
    scatter_fig, scatter_ax = plt.subplots(figsize=(10, 6))
    points = scatter_ax.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis',
                                edgecolor='k', s=50)
    scatter_ax.set_xlabel('Principal Component 1')
    scatter_ax.set_ylabel('Principal Component 2')
    scatter_ax.set_title('PCA - First Two Principal Components')
    scatter_fig.colorbar(points, ax=scatter_ax).set_label(target_label)
    scatter_fig.savefig(scatter_plot_file, **save_options)
    plt.close(scatter_fig)

    # Pair plot over at most the first five components. seaborn creates its
    # own figure, which becomes the current one for the pyplot calls below.
    num_components = min(X_pca.shape[1], 5)
    pc_names = [f'PC{idx}' for idx in range(1, num_components + 1)]
    pca_df = pd.DataFrame(X_pca[:, :num_components], columns=pc_names)
    pca_df[target_label] = y.values
    sns.pairplot(pca_df, vars=pc_names, hue=target_label, palette='viridis')
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig(pair_plot_file, **save_options)
    plt.close()

    # Scree plot: explained-variance ratio per component.
    component_numbers = range(1, pca.n_components_ + 1)
    scree_fig, scree_ax = plt.subplots(figsize=(8, 5))
    scree_ax.bar(component_numbers, pca.explained_variance_ratio_, alpha=0.7, color='red')
    scree_ax.set_xlabel('Principal Components')
    scree_ax.set_ylabel('Variance Explained')
    scree_ax.set_title('Scree Plot')
    scree_ax.set_xticks(component_numbers)
    scree_fig.savefig(scree_plot_file, **save_options)
    plt.close(scree_fig)

    return scatter_plot_file, pair_plot_file, scree_plot_file

def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
    """Run the load -> preprocess -> PCA -> plot pipeline for the Gradio UI.

    Args:
        target_column: Name of the target column in the dataset.
        n_components: Number of principal components to compute.
        k_best: Number of features for SelectKBest; values of 0 or less
            skip feature selection.

    Returns:
        Tuple of the scatter, pair and scree plot file names.
    """
    # A non-positive k means "no feature selection".
    feature_limit = None
    if k_best > 0:
        feature_limit = k_best

    frame = load_data('df_usa_health_features.parquet')
    features, target = preprocess_data(frame, target_column, feature_limit)
    components, fitted_pca = apply_pca(features, n_components)
    return visualize_pca(components, target, fitted_pca, target_label=target_column)

# ------------------------------------------------------------------------------
# Gradio UI. Components are constructed directly (gr.Textbox, gr.Slider,
# gr.Image) rather than through the legacy gr.inputs / gr.outputs namespaces.
# ------------------------------------------------------------------------------
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Target Column", value="Median_Income_Household"),
        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
    ],
    # gradio_interface returns paths to saved PNG files. "filepath" replaces
    # the deprecated "file" spelling of the Image `type` option.
    # NOTE(review): newer Gradio releases appear to reject "file" outright -
    # confirm against the installed version.
    outputs=[
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
        gr.Image(type="filepath", label="Scree Plot")
    ],
    title="PCA Visualization with DuckDB and Gradio",
    description=(
        "Load data from a Parquet file, optionally perform feature selection, "
        "run PCA, and visualize the results.\n"
        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
        "2) Choose the number of PCA components.\n"
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    )
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()