LeonceNsh committed on
Commit
bed9d39
·
verified ·
1 Parent(s): 70b6418

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +240 -39
app.py CHANGED
@@ -1,33 +1,114 @@
1
  import gradio as gr
2
  import pandas as pd
3
  import duckdb
4
- from sklearn.decomposition import PCA
5
- from sklearn.preprocessing import StandardScaler
6
- from sklearn.feature_selection import SelectKBest, f_regression
7
  import numpy as np
8
  import matplotlib.pyplot as plt
9
  import seaborn as sns
10
 
11
- # Function to load data from a Parquet file into a DuckDB in-memory database
12
- def load_data(parquet_file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  con = duckdb.connect(database=':memory:')
14
- con.execute(f"CREATE TABLE data AS SELECT * FROM read_parquet('{parquet_file}')")
 
 
 
 
15
  df = con.execute("SELECT * FROM data").fetchdf()
16
  return df
17
 
18
- # Function to preprocess data and perform PCA
19
- def preprocess_and_pca(df, target_column, n_components=5):
20
- # Drop non-numeric columns
21
- X = df.select_dtypes(include=[float, int])
22
- y = df[target_column]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- # Replace infinity values with NaN
 
 
 
 
 
 
 
 
 
 
 
25
  X.replace([np.inf, -np.inf], np.nan, inplace=True)
26
 
27
- # Handle missing values by imputing with the median
28
  X = X.fillna(X.median())
29
  y = y.fillna(y.median())
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # Standardize the data
32
  scaler = StandardScaler()
33
  X_scaled = scaler.fit_transform(X)
@@ -36,53 +117,173 @@ def preprocess_and_pca(df, target_column, n_components=5):
36
  pca = PCA(n_components=n_components)
37
  X_pca = pca.fit_transform(X_scaled)
38
 
39
- return X_pca, y
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- # Function to visualize the PCA components
42
- def visualize_pca(X_pca, y):
43
- # Visualize the first two principal components
 
 
 
 
 
 
 
 
44
  plt.figure(figsize=(10, 6))
45
- plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
 
 
 
 
 
 
 
46
  plt.xlabel('Principal Component 1')
47
  plt.ylabel('Principal Component 2')
48
  plt.title('PCA - First Two Principal Components')
49
- plt.colorbar(label='Median Income Household')
50
- plt.savefig('pca_scatter.png')
 
51
  plt.close()
52
 
53
- # Create a DataFrame with the first few principal components for pair plot
54
- pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(X_pca.shape[1])])
55
- pca_df['Median_Income_Household'] = y
56
-
57
- # Pair plot of the first few principal components
58
- sns.pairplot(pca_df, vars=[f'PC{i+1}' for i in range(5)], hue='Median_Income_Household', palette='viridis')
 
 
 
 
 
 
 
 
59
  plt.suptitle('Pair Plot of Principal Components', y=1.02)
60
- plt.savefig('pca_pairplot.png')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  plt.close()
62
 
63
- return 'pca_scatter.png', 'pca_pairplot.png'
64
 
65
- # Gradio interface function
66
- def gradio_interface(target_column):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  df = load_data('df_usa_health_features.parquet')
68
- X_pca, y = preprocess_and_pca(df, target_column)
69
- scatter_plot, pair_plot = visualize_pca(X_pca, y)
70
- return scatter_plot, pair_plot
71
 
72
- # Create Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
73
  iface = gr.Interface(
74
  fn=gradio_interface,
75
  inputs=[
76
- gr.inputs.Textbox(label="Target Column")
 
 
77
  ],
78
  outputs=[
79
  gr.outputs.Image(type="file", label="PCA Scatter Plot"),
80
- gr.outputs.Image(type="file", label="PCA Pair Plot")
 
81
  ],
82
  title="PCA Visualization with DuckDB and Gradio",
83
- description="Specify the target column to visualize PCA components from the df_usa_health_features.parquet file."
 
 
 
 
 
 
84
  )
85
 
86
- # Launch the Gradio app
 
 
87
  if __name__ == "__main__":
88
- iface.launch()
 
1
  import gradio as gr
2
  import pandas as pd
3
  import duckdb
 
 
 
4
  import numpy as np
5
  import matplotlib.pyplot as plt
6
  import seaborn as sns
7
 
8
+ from sklearn.decomposition import PCA
9
+ from sklearn.preprocessing import StandardScaler
10
+ from sklearn.feature_selection import SelectKBest, f_regression
11
+
12
+ # ---------------------------------------------------------------------------
13
+ # 1. LOADING DATA
14
+ # ---------------------------------------------------------------------------
15
def load_data(parquet_file: str) -> pd.DataFrame:
    """
    Read a Parquet file through an in-memory DuckDB connection.

    Parameters
    ----------
    parquet_file : str
        The path to the Parquet file to be loaded.

    Returns
    -------
    df : pd.DataFrame
        Pandas DataFrame containing all columns from the Parquet file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a query parameter instead of interpolating it into
        # the SQL text, so a quote character in the path cannot break or
        # alter the statement. Selecting directly from read_parquet also
        # avoids creating a table that would only be read back once.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [str(parquet_file)]
        ).fetchdf()
    finally:
        # Release the in-memory database even when the read fails.
        con.close()
38
 
39
+ # ---------------------------------------------------------------------------
40
+ # 2. DATA PREPROCESSING & OPTIONAL FEATURE SELECTION
41
+ # ---------------------------------------------------------------------------
42
def preprocess_data(
    df: pd.DataFrame,
    target_column: str,
    k_best: int = None
):
    """
    Perform data cleaning and (optionally) feature selection.

    The feature matrix is built from the numeric columns of ``df`` with the
    target column removed, so the target cannot leak into feature selection
    or PCA. Infinite values are treated as missing, columns with no usable
    values are dropped, and remaining gaps are filled with the column median.

    Parameters
    ----------
    df : pd.DataFrame
        The input DataFrame.
    target_column : str
        The name of the target variable in df.
    k_best : int, optional
        If provided (and between 1 and the number of available features),
        use SelectKBest with f_regression to keep the top k features.
        If None, no feature selection is performed.

    Returns
    -------
    X : pd.DataFrame
        Preprocessed and optionally reduced DataFrame (features).
    y : pd.Series
        The target column as a Series, median-imputed.

    Raises
    ------
    KeyError
        If ``target_column`` is not a column of ``df``.
    """
    if target_column not in df.columns:
        raise KeyError(
            f"Target column {target_column!r} not found in the data"
        )

    infinities = [np.inf, -np.inf]

    # Work on derived objects only; the caller's DataFrame is never mutated.
    y = df[target_column].replace(infinities, np.nan)
    X = (
        df.select_dtypes(include="number")
        # The target must not be one of its own predictors.
        .drop(columns=[target_column], errors="ignore")
        .replace(infinities, np.nan)
        # A column without any finite value has a NaN median and would
        # survive imputation as NaN, so remove it up front.
        .dropna(axis=1, how="all")
    )

    # Impute missing values with the median
    X = X.fillna(X.median())
    y = y.fillna(y.median())

    # Optional: Use SelectKBest to filter down to top k features
    if k_best is not None and 0 < int(k_best) < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=int(k_best))
        selector.fit(X, y)
        # Index by position so the result stays a DataFrame with its
        # original column names.
        X = X.iloc[:, selector.get_support(indices=True)]

    return X, y
87
+
88
+ # ---------------------------------------------------------------------------
89
+ # 3. PCA TRANSFORMATION
90
+ # ---------------------------------------------------------------------------
91
def apply_pca(
    X: pd.DataFrame,
    n_components: int = 5
):
    """
    Standardize the data and apply PCA.

    Parameters
    ----------
    X : pd.DataFrame
        The preprocessed numeric data.
    n_components : int
        Number of principal components to retain. The value is coerced to
        an integer and capped at ``min(n_rows, n_columns)`` of ``X``.

    Returns
    -------
    X_pca : np.ndarray
        PCA-transformed dataset.
    pca : PCA
        The trained PCA model (can be used for explained_variance_ etc.).

    Raises
    ------
    ValueError
        If ``X`` has no rows or no columns.
    """
    n_samples, n_features = X.shape
    if n_samples == 0 or n_features == 0:
        raise ValueError(
            "PCA needs at least one row and one numeric feature column"
        )

    # UI sliders may hand over floats, and PCA does not accept a float >= 1
    # as a component count. It also cannot extract more components than
    # min(n_samples, n_features), so clamp instead of failing.
    n_components = max(1, min(int(n_components), n_samples, n_features))

    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Apply PCA
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    return X_pca, pca
121
+
122
+ # ---------------------------------------------------------------------------
123
+ # 4. VISUALIZATION
124
+ # ---------------------------------------------------------------------------
125
def visualize_pca(
    X_pca: np.ndarray,
    y: pd.Series,
    pca,
    target_label: str = 'Median Income Household'
):
    """
    Generate visualizations:
      1) Scatter plot of first two principal components.
      2) Pair plot of the first few principal components (up to 5).
      3) Scree plot showing the explained variance of each component.

    Parameters
    ----------
    X_pca : np.ndarray
        PCA-transformed data (rows x components); at least two components.
    y : pd.Series
        Target values, used for coloring points in plots.
    pca : PCA
        The fitted PCA model (for explained variance, etc.).
    target_label : str
        Label for the colorbar / legend representing the target variable.

    Returns
    -------
    scatter_plot_file : str
        Filename for the PCA scatter plot.
    pair_plot_file : str
        Filename for the PCA pair plot.
    scree_plot_file : str
        Filename for the Scree plot.

    Raises
    ------
    ValueError
        If ``X_pca`` is not two-dimensional or has fewer than two components.
    """
    X_pca = np.asarray(X_pca)
    if X_pca.ndim != 2 or X_pca.shape[1] < 2:
        raise ValueError(
            "At least two principal components are required for plotting"
        )
    # Positional values only: the PCA output carries no pandas index to
    # align against.
    target_values = np.asarray(y)

    # 4.a: Scatter plot for the first two components
    scatter_plot_file = 'pca_scatter.png'
    fig, ax = plt.subplots(figsize=(10, 6))
    points = ax.scatter(
        X_pca[:, 0],
        X_pca[:, 1],
        c=target_values,
        cmap='viridis',
        edgecolor='k',
        s=50
    )
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_title('PCA - First Two Principal Components')
    fig.colorbar(points, ax=ax).set_label(target_label)
    fig.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
    plt.close(fig)

    # 4.b: Pair plot of the first few components
    num_components = min(X_pca.shape[1], 5)
    pair_plot_file = 'pca_pairplot.png'
    component_names = [f'PC{i + 1}' for i in range(num_components)]
    pca_df = pd.DataFrame(
        X_pca[:, :num_components],
        columns=component_names
    )

    # seaborn treats every distinct hue value as its own group. For a
    # continuous target that means one group per row, so bin the target
    # into quantile ranges when it has many distinct values.
    hue_values = pd.Series(target_values)
    hue_order = None
    if pd.api.types.is_numeric_dtype(hue_values) and hue_values.nunique() > 10:
        binned = pd.qcut(hue_values, q=5, duplicates='drop')
        # Plain strings plus an explicit order keep the legend sorted from
        # the lowest to the highest range.
        hue_order = [str(interval) for interval in binned.cat.categories]
        hue_values = binned.astype(str)
    pca_df[target_label] = hue_values.values  # Add target for hue in pairplot

    grid = sns.pairplot(
        pca_df,
        vars=component_names,
        hue=target_label,
        hue_order=hue_order,
        palette='viridis'
    )
    grid.fig.suptitle('Pair Plot of Principal Components', y=1.02)
    grid.fig.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
    plt.close(grid.fig)

    # 4.c: Scree plot for explained variance
    scree_plot_file = 'pca_scree.png'
    component_numbers = list(range(1, pca.n_components_ + 1))
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(
        component_numbers,
        pca.explained_variance_ratio_,
        alpha=0.7,
        color='red'
    )
    ax.set_xlabel('Principal Components')
    ax.set_ylabel('Variance Explained')
    ax.set_title('Scree Plot')
    ax.set_xticks(component_numbers)
    fig.savefig(scree_plot_file, dpi=100, bbox_inches='tight')
    plt.close(fig)

    return scatter_plot_file, pair_plot_file, scree_plot_file
211
 
212
+ # ---------------------------------------------------------------------------
213
+ # 5. GRADIO INTERFACE FUNCTION
214
+ # ---------------------------------------------------------------------------
215
def gradio_interface(
    target_column: str,
    n_components: int = 5,
    k_best: int = 0
):
    """
    Main interface function that:
      - Loads the Parquet data
      - Preprocesses the data (optionally using SelectKBest feature selection)
      - Applies PCA
      - Visualizes the PCA results
      - Returns file paths to the generated plots.

    Parameters
    ----------
    target_column : str
        The name of the target column in the DataFrame. Surrounding
        whitespace is ignored.
    n_components : int
        Number of PCA components to use. Coerced to an integer and capped
        at the number of rows / feature columns available.
    k_best : int
        If > 0, select the top k_best features before PCA.
        If 0 or None, no feature selection is performed.

    Returns
    -------
    scatter_plot : str
        File path to the PCA scatter plot.
    pair_plot : str
        File path to the pair plot of principal components.
    scree_plot : str
        File path to the scree plot of explained variance.
    """
    # Values coming from UI widgets may be floats and text may carry stray
    # whitespace; normalise before they reach pandas / scikit-learn.
    target_column = target_column.strip()
    n_components = int(n_components)
    k_best = int(k_best) if k_best else 0

    # Load data
    df = load_data('df_usa_health_features.parquet')

    # Preprocess data (optionally do feature selection)
    X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)

    # PCA cannot return more components than there are rows or features.
    n_components = min(n_components, *X.shape)

    # Apply PCA
    X_pca, pca_model = apply_pca(X, n_components)

    # Generate and return visualizations
    scatter_plot, pair_plot, scree_plot = visualize_pca(
        X_pca, y, pca_model, target_label=target_column
    )
    return scatter_plot, pair_plot, scree_plot
259
+
260
+ # ---------------------------------------------------------------------------
261
+ # 6. BUILDING THE GRADIO APP
262
+ # ---------------------------------------------------------------------------
263
# The gr.inputs / gr.outputs namespaces (with `default=` and type="file") are
# the legacy component API and are absent from current Gradio releases.
# Gradio 3+ (identified here by the presence of gr.Blocks) exposes the
# components at the top level and takes the initial value via `value=`.
# Use the current API when available and fall back to the legacy one otherwise.
if hasattr(gr, "Blocks"):
    _inputs = [
        gr.Textbox(label="Target Column", value="Median_Income_Household"),
        gr.Slider(minimum=2, maximum=20, step=1, value=5,
                  label="Number of PCA Components"),
        gr.Slider(minimum=0, maximum=50, step=1, value=0,
                  label="SelectKBest (0 to skip)"),
    ]
    _outputs = [
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
        gr.Image(type="filepath", label="Scree Plot"),
    ]
else:
    _inputs = [
        gr.inputs.Textbox(label="Target Column", default="Median_Income_Household"),
        gr.inputs.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, default=5),
        gr.inputs.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, default=0),
    ]
    _outputs = [
        gr.outputs.Image(type="file", label="PCA Scatter Plot"),
        gr.outputs.Image(type="file", label="PCA Pair Plot"),
        gr.outputs.Image(type="file", label="Scree Plot"),
    ]

iface = gr.Interface(
    fn=gradio_interface,
    inputs=_inputs,
    outputs=_outputs,
    title="PCA Visualization with DuckDB and Gradio",
    description=(
        "Load data from a Parquet file, optionally perform feature selection, "
        "run PCA, and visualize the results. "
        "1) Enter the target column name (e.g., 'Median_Income_Household'). "
        "2) Choose the number of PCA components. "
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    )
)
284
 
285
+ # ---------------------------------------------------------------------------
286
+ # 7. LAUNCH THE APPLICATION
287
+ # ---------------------------------------------------------------------------
288
if __name__ == "__main__":
    # Start the web UI only when executed as a script, not when imported.
    iface.launch()