LeonceNsh committed on
Commit
d18faab
·
verified ·
1 Parent(s): 54ff036

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -9
app.py CHANGED
@@ -24,11 +24,14 @@ def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
24
  X.replace([np.inf, -np.inf], np.nan, inplace=True)
25
  X = X.fillna(X.median())
26
  y = y.fillna(y.median())
 
 
27
  if k_best is not None and k_best < X.shape[1]:
28
  selector = SelectKBest(score_func=f_regression, k=k_best)
29
  X_selected = selector.fit_transform(X, y)
30
  selected_indices = selector.get_support(indices=True)
31
  X = X.iloc[:, selected_indices]
 
32
  return X, y
33
 
34
  def apply_pca(X: pd.DataFrame, n_components: int = 5):
@@ -38,7 +41,8 @@ def apply_pca(X: pd.DataFrame, n_components: int = 5):
38
  X_pca = pca.fit_transform(X_scaled)
39
  return X_pca, pca
40
 
41
- def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Median Income Household'):
 
42
  scatter_plot_file = 'pca_scatter.png'
43
  plt.figure(figsize=(10, 6))
44
  plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
@@ -50,7 +54,7 @@ def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Med
50
  plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
51
  plt.close()
52
 
53
- # Pair plot
54
  pair_plot_file = 'pca_pairplot.png'
55
  num_components = min(X_pca.shape[1], 5)
56
  pca_df = pd.DataFrame(X_pca[:, :num_components],
@@ -62,7 +66,7 @@ def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Med
62
  plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
63
  plt.close()
64
 
65
- # Scree plot
66
  scree_plot_file = 'pca_scree.png'
67
  plt.figure(figsize=(8, 5))
68
  plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, alpha=0.7, color='red')
@@ -82,9 +86,6 @@ def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0)
82
  scatter_plot, pair_plot, scree_plot = visualize_pca(X_pca, y, pca_model, target_label=target_column)
83
  return scatter_plot, pair_plot, scree_plot
84
 
85
- # ------------------------------------------------------------------------------
86
- # HERE is the updated Gradio interface with direct component calls (no .inputs)
87
- # ------------------------------------------------------------------------------
88
  iface = gr.Interface(
89
  fn=gradio_interface,
90
  inputs=[
@@ -92,10 +93,11 @@ iface = gr.Interface(
92
  gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
93
  gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
94
  ],
 
95
  outputs=[
96
- gr.Image(type="file", label="PCA Scatter Plot"),
97
- gr.Image(type="file", label="PCA Pair Plot"),
98
- gr.Image(type="file", label="Scree Plot")
99
  ],
100
  title="PCA Visualization with DuckDB and Gradio",
101
  description=(
 
24
  X.replace([np.inf, -np.inf], np.nan, inplace=True)
25
  X = X.fillna(X.median())
26
  y = y.fillna(y.median())
27
+
28
+ # Optionally select top k features
29
  if k_best is not None and k_best < X.shape[1]:
30
  selector = SelectKBest(score_func=f_regression, k=k_best)
31
  X_selected = selector.fit_transform(X, y)
32
  selected_indices = selector.get_support(indices=True)
33
  X = X.iloc[:, selected_indices]
34
+
35
  return X, y
36
 
37
  def apply_pca(X: pd.DataFrame, n_components: int = 5):
 
41
  X_pca = pca.fit_transform(X_scaled)
42
  return X_pca, pca
43
 
44
+ def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str):
45
+ # 1) Scatter plot of PC1 vs PC2
46
  scatter_plot_file = 'pca_scatter.png'
47
  plt.figure(figsize=(10, 6))
48
  plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
 
54
  plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
55
  plt.close()
56
 
57
+ # 2) Pair plot of the first few components
58
  pair_plot_file = 'pca_pairplot.png'
59
  num_components = min(X_pca.shape[1], 5)
60
  pca_df = pd.DataFrame(X_pca[:, :num_components],
 
66
  plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
67
  plt.close()
68
 
69
+ # 3) Scree plot
70
  scree_plot_file = 'pca_scree.png'
71
  plt.figure(figsize=(8, 5))
72
  plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, alpha=0.7, color='red')
 
86
  scatter_plot, pair_plot, scree_plot = visualize_pca(X_pca, y, pca_model, target_label=target_column)
87
  return scatter_plot, pair_plot, scree_plot
88
 
 
 
 
89
  iface = gr.Interface(
90
  fn=gradio_interface,
91
  inputs=[
 
93
  gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
94
  gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
95
  ],
96
+ # Use 'filepath' instead of 'file' so Gradio knows you're returning local image paths
97
  outputs=[
98
+ gr.Image(type="filepath", label="PCA Scatter Plot"),
99
+ gr.Image(type="filepath", label="PCA Pair Plot"),
100
+ gr.Image(type="filepath", label="Scree Plot")
101
  ],
102
  title="PCA Visualization with DuckDB and Gradio",
103
  description=(