Update app.py
Browse files
app.py
CHANGED
@@ -24,11 +24,14 @@ def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
|
|
24 |
X.replace([np.inf, -np.inf], np.nan, inplace=True)
|
25 |
X = X.fillna(X.median())
|
26 |
y = y.fillna(y.median())
|
|
|
|
|
27 |
if k_best is not None and k_best < X.shape[1]:
|
28 |
selector = SelectKBest(score_func=f_regression, k=k_best)
|
29 |
X_selected = selector.fit_transform(X, y)
|
30 |
selected_indices = selector.get_support(indices=True)
|
31 |
X = X.iloc[:, selected_indices]
|
|
|
32 |
return X, y
|
33 |
|
34 |
def apply_pca(X: pd.DataFrame, n_components: int = 5):
|
@@ -38,7 +41,8 @@ def apply_pca(X: pd.DataFrame, n_components: int = 5):
|
|
38 |
X_pca = pca.fit_transform(X_scaled)
|
39 |
return X_pca, pca
|
40 |
|
41 |
-
def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str
|
|
|
42 |
scatter_plot_file = 'pca_scatter.png'
|
43 |
plt.figure(figsize=(10, 6))
|
44 |
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
|
@@ -50,7 +54,7 @@ def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Med
|
|
50 |
plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
|
51 |
plt.close()
|
52 |
|
53 |
-
# Pair plot
|
54 |
pair_plot_file = 'pca_pairplot.png'
|
55 |
num_components = min(X_pca.shape[1], 5)
|
56 |
pca_df = pd.DataFrame(X_pca[:, :num_components],
|
@@ -62,7 +66,7 @@ def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Med
|
|
62 |
plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
|
63 |
plt.close()
|
64 |
|
65 |
-
# Scree plot
|
66 |
scree_plot_file = 'pca_scree.png'
|
67 |
plt.figure(figsize=(8, 5))
|
68 |
plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, alpha=0.7, color='red')
|
@@ -82,9 +86,6 @@ def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0)
|
|
82 |
scatter_plot, pair_plot, scree_plot = visualize_pca(X_pca, y, pca_model, target_label=target_column)
|
83 |
return scatter_plot, pair_plot, scree_plot
|
84 |
|
85 |
-
# ------------------------------------------------------------------------------
|
86 |
-
# HERE is the updated Gradio interface with direct component calls (no .inputs)
|
87 |
-
# ------------------------------------------------------------------------------
|
88 |
iface = gr.Interface(
|
89 |
fn=gradio_interface,
|
90 |
inputs=[
|
@@ -92,10 +93,11 @@ iface = gr.Interface(
|
|
92 |
gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
|
93 |
gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
|
94 |
],
|
|
|
95 |
outputs=[
|
96 |
-
gr.Image(type="file", label="PCA Scatter Plot"),
|
97 |
-
gr.Image(type="file", label="PCA Pair Plot"),
|
98 |
-
gr.Image(type="file", label="Scree Plot")
|
99 |
],
|
100 |
title="PCA Visualization with DuckDB and Gradio",
|
101 |
description=(
|
|
|
24 |
X.replace([np.inf, -np.inf], np.nan, inplace=True)
|
25 |
X = X.fillna(X.median())
|
26 |
y = y.fillna(y.median())
|
27 |
+
|
28 |
+
# Optionally select top k features
|
29 |
if k_best is not None and k_best < X.shape[1]:
|
30 |
selector = SelectKBest(score_func=f_regression, k=k_best)
|
31 |
X_selected = selector.fit_transform(X, y)
|
32 |
selected_indices = selector.get_support(indices=True)
|
33 |
X = X.iloc[:, selected_indices]
|
34 |
+
|
35 |
return X, y
|
36 |
|
37 |
def apply_pca(X: pd.DataFrame, n_components: int = 5):
|
|
|
41 |
X_pca = pca.fit_transform(X_scaled)
|
42 |
return X_pca, pca
|
43 |
|
44 |
+
def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str):
|
45 |
+
# 1) Scatter plot of PC1 vs PC2
|
46 |
scatter_plot_file = 'pca_scatter.png'
|
47 |
plt.figure(figsize=(10, 6))
|
48 |
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
|
|
|
54 |
plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
|
55 |
plt.close()
|
56 |
|
57 |
+
# 2) Pair plot of the first few components
|
58 |
pair_plot_file = 'pca_pairplot.png'
|
59 |
num_components = min(X_pca.shape[1], 5)
|
60 |
pca_df = pd.DataFrame(X_pca[:, :num_components],
|
|
|
66 |
plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
|
67 |
plt.close()
|
68 |
|
69 |
+
# 3) Scree plot
|
70 |
scree_plot_file = 'pca_scree.png'
|
71 |
plt.figure(figsize=(8, 5))
|
72 |
plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, alpha=0.7, color='red')
|
|
|
86 |
scatter_plot, pair_plot, scree_plot = visualize_pca(X_pca, y, pca_model, target_label=target_column)
|
87 |
return scatter_plot, pair_plot, scree_plot
|
88 |
|
|
|
|
|
|
|
89 |
iface = gr.Interface(
|
90 |
fn=gradio_interface,
|
91 |
inputs=[
|
|
|
93 |
gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
|
94 |
gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
|
95 |
],
|
96 |
+
# Use 'filepath' instead of 'file' so Gradio knows you're returning local image paths
|
97 |
outputs=[
|
98 |
+
gr.Image(type="filepath", label="PCA Scatter Plot"),
|
99 |
+
gr.Image(type="filepath", label="PCA Pair Plot"),
|
100 |
+
gr.Image(type="filepath", label="Scree Plot")
|
101 |
],
|
102 |
title="PCA Visualization with DuckDB and Gradio",
|
103 |
description=(
|