LeonceNsh committed
Commit 54ff036 · verified · 1 Parent(s): bed9d39

Update app.py

Files changed (1)
  1. app.py +26 -204
app.py CHANGED
@@ -9,162 +9,39 @@ from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_selection import SelectKBest, f_regression
 
-# ---------------------------------------------------------------------------
-# 1. LOADING DATA
-# ---------------------------------------------------------------------------
 def load_data(parquet_file: str) -> pd.DataFrame:
-    """
-    Load data from a Parquet file into a DuckDB in-memory database,
-    and return the result as a pandas DataFrame.
-
-    Parameters:
-    -----------
-    parquet_file : str
-        The path to the Parquet file to be loaded.
-
-    Returns:
-    --------
-    df : pd.DataFrame
-        Pandas DataFrame containing all columns from the Parquet file.
-    """
     con = duckdb.connect(database=':memory:')
     query = f"""
         CREATE TABLE data AS SELECT * FROM read_parquet('{parquet_file}')
     """
     con.execute(query)
-
     df = con.execute("SELECT * FROM data").fetchdf()
     return df
 
-# ---------------------------------------------------------------------------
-# 2. DATA PREPROCESSING & OPTIONAL FEATURE SELECTION
-# ---------------------------------------------------------------------------
-def preprocess_data(
-    df: pd.DataFrame,
-    target_column: str,
-    k_best: int = None
-):
-    """
-    Perform data cleaning and (optionally) feature selection.
-
-    Parameters:
-    -----------
-    df : pd.DataFrame
-        The input DataFrame.
-    target_column : str
-        The name of the target variable in df.
-    k_best : int, optional
-        If provided, use SelectKBest with f_regression to select the
-        top k features. If None, no feature selection is performed.
-
-    Returns:
-    --------
-    X : pd.DataFrame
-        Preprocessed and optionally reduced DataFrame (features).
-    y : pd.Series
-        The target column as a Series.
-    """
-    # Separate out the numeric columns for PCA
+def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
     X = df.select_dtypes(include=[float, int]).copy()
     y = df[target_column].copy()
-
-    # Replace infinities with NaN
     X.replace([np.inf, -np.inf], np.nan, inplace=True)
-
-    # Impute missing values with the median
     X = X.fillna(X.median())
     y = y.fillna(y.median())
-
-    # Optional: Use SelectKBest to filter down to top k features
     if k_best is not None and k_best < X.shape[1]:
         selector = SelectKBest(score_func=f_regression, k=k_best)
         X_selected = selector.fit_transform(X, y)
-        # We still want feature names to keep track of the columns
         selected_indices = selector.get_support(indices=True)
         X = X.iloc[:, selected_indices]
-
     return X, y
 
-# ---------------------------------------------------------------------------
-# 3. PCA TRANSFORMATION
-# ---------------------------------------------------------------------------
-def apply_pca(
-    X: pd.DataFrame,
-    n_components: int = 5
-):
-    """
-    Standardize the data and apply PCA.
-
-    Parameters:
-    -----------
-    X : pd.DataFrame
-        The preprocessed numeric data.
-    n_components : int
-        Number of principal components to retain.
-
-    Returns:
-    --------
-    X_pca : np.ndarray
-        PCA-transformed dataset.
-    pca : PCA
-        The trained PCA model (can be used for explained_variance_ etc.).
-    """
-    # Standardize the data
+def apply_pca(X: pd.DataFrame, n_components: int = 5):
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
-
-    # Apply PCA
     pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)
-
     return X_pca, pca
 
-# ---------------------------------------------------------------------------
-# 4. VISUALIZATION
-# ---------------------------------------------------------------------------
-def visualize_pca(
-    X_pca: np.ndarray,
-    y: pd.Series,
-    pca,
-    target_label: str = 'Median Income Household'
-):
-    """
-    Generate visualizations:
-      1) Scatter plot of first two principal components.
-      2) Pair plot of the first few principal components (up to 5).
-      3) Scree plot showing the explained variance of each component.
-
-    Parameters:
-    -----------
-    X_pca : np.ndarray
-        PCA-transformed data (rows x components).
-    y : pd.Series
-        Target values, used for coloring points in plots.
-    pca : PCA
-        The fitted PCA model (for explained variance, etc.).
-    target_label : str
-        Label for the colorbar representing the target variable.
-
-    Returns:
-    --------
-    scatter_plot_file : str
-        Filename for the PCA scatter plot.
-    pair_plot_file : str
-        Filename for the PCA pair plot.
-    scree_plot_file : str
-        Filename for the Scree plot.
-    """
-    # 4.a: Scatter plot for the first two components
+def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Median Income Household'):
     scatter_plot_file = 'pca_scatter.png'
     plt.figure(figsize=(10, 6))
-    plt.scatter(
-        X_pca[:, 0],
-        X_pca[:, 1],
-        c=y,
-        cmap='viridis',
-        edgecolor='k',
-        s=50
-    )
+    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
     plt.xlabel('Principal Component 1')
     plt.ylabel('Principal Component 2')
     plt.title('PCA - First Two Principal Components')
@@ -173,33 +50,22 @@ def visualize_pca(
     plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
     plt.close()
 
-    # 4.b: Pair plot of the first few components
-    num_components = min(X_pca.shape[1], 5)
+    # Pair plot
     pair_plot_file = 'pca_pairplot.png'
-    pca_df = pd.DataFrame(
-        X_pca[:, :num_components],
-        columns=[f'PC{i+1}' for i in range(num_components)]
-    )
-    pca_df[target_label] = y.values  # Add target for hue in pairplot
-    sns.pairplot(
-        pca_df,
-        vars=[f'PC{i+1}' for i in range(num_components)],
-        hue=target_label,
-        palette='viridis'
-    )
+    num_components = min(X_pca.shape[1], 5)
+    pca_df = pd.DataFrame(X_pca[:, :num_components],
+                          columns=[f'PC{i+1}' for i in range(num_components)])
+    pca_df[target_label] = y.values
+    sns.pairplot(pca_df, vars=[f'PC{i+1}' for i in range(num_components)],
+                 hue=target_label, palette='viridis')
     plt.suptitle('Pair Plot of Principal Components', y=1.02)
     plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
     plt.close()
 
-    # 4.c: Scree plot for explained variance
+    # Scree plot
     scree_plot_file = 'pca_scree.png'
     plt.figure(figsize=(8, 5))
-    plt.bar(
-        range(1, pca.n_components_ + 1),
-        pca.explained_variance_ratio_,
-        alpha=0.7,
-        color='red'
-    )
+    plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, alpha=0.7, color='red')
     plt.xlabel('Principal Components')
     plt.ylabel('Variance Explained')
     plt.title('Scree Plot')
@@ -209,81 +75,37 @@ def visualize_pca(
 
     return scatter_plot_file, pair_plot_file, scree_plot_file
 
-# ---------------------------------------------------------------------------
-# 5. GRADIO INTERFACE FUNCTION
-# ---------------------------------------------------------------------------
-def gradio_interface(
-    target_column: str,
-    n_components: int = 5,
-    k_best: int = 0
-):
-    """
-    Main interface function that:
-      - Loads the Parquet data
-      - Preprocesses the data (optionally using SelectKBest feature selection)
-      - Applies PCA
-      - Visualizes the PCA results
-      - Returns file paths to the generated plots.
-
-    Parameters:
-    -----------
-    target_column : str
-        The name of the target column in the DataFrame.
-    n_components : int
-        Number of PCA components to use.
-    k_best : int
-        If > 0, select the top k_best features before PCA.
-        If 0 or None, no feature selection is performed.
-
-    Returns:
-    --------
-    scatter_plot : str
-        File path to the PCA scatter plot.
-    pair_plot : str
-        File path to the pair plot of principal components.
-    scree_plot : str
-        File path to the scree plot of explained variance.
-    """
-    # Load data
+def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
     df = load_data('df_usa_health_features.parquet')
-
-    # Preprocess data (optionally do feature selection)
     X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
-
-    # Apply PCA
     X_pca, pca_model = apply_pca(X, n_components)
-
-    # Generate and return visualizations
     scatter_plot, pair_plot, scree_plot = visualize_pca(X_pca, y, pca_model, target_label=target_column)
     return scatter_plot, pair_plot, scree_plot
 
-# ---------------------------------------------------------------------------
-# 6. BUILDING THE GRADIO APP
-# ---------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
+# HERE is the updated Gradio interface with direct component calls (no .inputs)
+# ------------------------------------------------------------------------------
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.inputs.Textbox(label="Target Column", default="Median_Income_Household"),
-        gr.inputs.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, default=5),
-        gr.inputs.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, default=0)
+        gr.Textbox(label="Target Column", value="Median_Income_Household"),
+        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
+        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
     ],
     outputs=[
-        gr.outputs.Image(type="file", label="PCA Scatter Plot"),
-        gr.outputs.Image(type="file", label="PCA Pair Plot"),
-        gr.outputs.Image(type="file", label="Scree Plot")
+        gr.Image(type="file", label="PCA Scatter Plot"),
+        gr.Image(type="file", label="PCA Pair Plot"),
+        gr.Image(type="file", label="Scree Plot")
     ],
     title="PCA Visualization with DuckDB and Gradio",
     description=(
         "Load data from a Parquet file, optionally perform feature selection, "
-        "run PCA, and visualize the results. "
-        "1) Enter the target column name (e.g., 'Median_Income_Household'). "
-        "2) Choose the number of PCA components. "
+        "run PCA, and visualize the results.\n"
+        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
+        "2) Choose the number of PCA components.\n"
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    )
 )
 
-# ---------------------------------------------------------------------------
-# 7. LAUNCH THE APPLICATION
-# ---------------------------------------------------------------------------
 if __name__ == "__main__":
     iface.launch()
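
For context, the core of this commit is the move from Gradio's removed gr.inputs.* / gr.outputs.* wrappers to direct component constructors, with component defaults passed as value= instead of default=. The stand-alone sketch below is not part of app.py; the echo function and its labels are hypothetical, and it only illustrates the same component-style migration under the assumption of a Gradio release (3.x or later) where the old wrappers no longer exist.

import gradio as gr

def echo(text: str, repeats: int) -> str:
    # Toy stand-in for gradio_interface; purely illustrative.
    return text * int(repeats)

demo = gr.Interface(
    fn=echo,
    inputs=[
        # Old style: gr.inputs.Textbox(label="Text", default="hello")
        gr.Textbox(label="Text", value="hello"),
        # Old style: gr.inputs.Slider(1, 5, step=1, default=2, label="Repeats")
        gr.Slider(minimum=1, maximum=5, step=1, value=2, label="Repeats"),
    ],
    # Old style: gr.outputs.Textbox(label="Result")
    outputs=gr.Textbox(label="Result"),
)

if __name__ == "__main__":
    demo.launch()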