Update app.py
Browse files
app.py
CHANGED
@@ -1,33 +1,114 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import duckdb
|
4 |
-
from sklearn.decomposition import PCA
|
5 |
-
from sklearn.preprocessing import StandardScaler
|
6 |
-
from sklearn.feature_selection import SelectKBest, f_regression
|
7 |
import numpy as np
|
8 |
import matplotlib.pyplot as plt
|
9 |
import seaborn as sns
|
10 |
|
11 |
-
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
con = duckdb.connect(database=':memory:')
|
14 |
-
|
|
|
|
|
|
|
|
|
15 |
df = con.execute("SELECT * FROM data").fetchdf()
|
16 |
return df
|
17 |
|
18 |
-
#
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
X.replace([np.inf, -np.inf], np.nan, inplace=True)
|
26 |
|
27 |
-
#
|
28 |
X = X.fillna(X.median())
|
29 |
y = y.fillna(y.median())
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
# Standardize the data
|
32 |
scaler = StandardScaler()
|
33 |
X_scaled = scaler.fit_transform(X)
|
@@ -36,53 +117,173 @@ def preprocess_and_pca(df, target_column, n_components=5):
|
|
36 |
pca = PCA(n_components=n_components)
|
37 |
X_pca = pca.fit_transform(X_scaled)
|
38 |
|
39 |
-
return X_pca,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
40 |
|
41 |
-
|
42 |
-
|
43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
44 |
plt.figure(figsize=(10, 6))
|
45 |
-
plt.scatter(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
46 |
plt.xlabel('Principal Component 1')
|
47 |
plt.ylabel('Principal Component 2')
|
48 |
plt.title('PCA - First Two Principal Components')
|
49 |
-
plt.colorbar(
|
50 |
-
|
|
|
51 |
plt.close()
|
52 |
|
53 |
-
#
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
plt.suptitle('Pair Plot of Principal Components', y=1.02)
|
60 |
-
plt.savefig('
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
plt.close()
|
62 |
|
63 |
-
return
|
64 |
|
65 |
-
#
|
66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
67 |
df = load_data('df_usa_health_features.parquet')
|
68 |
-
X_pca, y = preprocess_and_pca(df, target_column)
|
69 |
-
scatter_plot, pair_plot = visualize_pca(X_pca, y)
|
70 |
-
return scatter_plot, pair_plot
|
71 |
|
72 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
73 |
iface = gr.Interface(
|
74 |
fn=gradio_interface,
|
75 |
inputs=[
|
76 |
-
gr.inputs.Textbox(label="Target Column")
|
|
|
|
|
77 |
],
|
78 |
outputs=[
|
79 |
gr.outputs.Image(type="file", label="PCA Scatter Plot"),
|
80 |
-
gr.outputs.Image(type="file", label="PCA Pair Plot")
|
|
|
81 |
],
|
82 |
title="PCA Visualization with DuckDB and Gradio",
|
83 |
-
description=
|
|
|
|
|
|
|
|
|
|
|
|
|
84 |
)
|
85 |
|
86 |
-
#
|
|
|
|
|
87 |
if __name__ == "__main__":
|
88 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
import duckdb
|
|
|
|
|
|
|
4 |
import numpy as np
|
5 |
import matplotlib.pyplot as plt
|
6 |
import seaborn as sns
|
7 |
|
8 |
+
from sklearn.decomposition import PCA
|
9 |
+
from sklearn.preprocessing import StandardScaler
|
10 |
+
from sklearn.feature_selection import SelectKBest, f_regression
|
11 |
+
|
12 |
+
# ---------------------------------------------------------------------------
|
13 |
+
# 1. LOADING DATA
|
14 |
+
# ---------------------------------------------------------------------------
|
15 |
+
def load_data(parquet_file: str) -> pd.DataFrame:
    """
    Read a Parquet file through an in-memory DuckDB database and return
    its contents as a pandas DataFrame.

    Parameters:
    -----------
    parquet_file : str
        The path to the Parquet file to be loaded.

    Returns:
    --------
    df : pd.DataFrame
        Pandas DataFrame containing all columns from the Parquet file.
    """
    con = duckdb.connect(database=':memory:')
    try:
        # Bind the path as a statement parameter instead of formatting it
        # into the SQL text, so a quote character in the file name cannot
        # terminate the string literal and alter the query.
        return con.execute(
            "SELECT * FROM read_parquet(?)", [parquet_file]
        ).fetchdf()
    finally:
        # Release the connection even when reading the file fails.
        con.close()
|
38 |
|
39 |
+
# ---------------------------------------------------------------------------
|
40 |
+
# 2. DATA PREPROCESSING & OPTIONAL FEATURE SELECTION
|
41 |
+
# ---------------------------------------------------------------------------
|
42 |
+
def preprocess_data(
    df: pd.DataFrame,
    target_column: str,
    k_best: int = None
):
    """
    Perform data cleaning and (optionally) feature selection.

    The input DataFrame is not modified.

    Parameters:
    -----------
    df : pd.DataFrame
        The input DataFrame.
    target_column : str
        The name of the target variable in df.
    k_best : int, optional
        If provided, use SelectKBest with f_regression to select the
        top k features. If None, or if k is not smaller than the number of
        available features, no feature selection is performed.

    Returns:
    --------
    X : pd.DataFrame
        Cleaned numeric feature columns, excluding the target column,
        optionally reduced to the k best features.
    y : pd.Series
        The target column as a Series, with missing values imputed.

    Raises:
    -------
    KeyError
        If target_column is not a column of df.
    """
    # Keep the numeric columns only. The target must not stay in the feature
    # matrix: otherwise it is scored against itself by SelectKBest and leaks
    # into the principal components.
    X = df.select_dtypes(include=[float, int]).drop(
        columns=[target_column], errors="ignore"
    )
    y = df[target_column].copy()

    # Replace infinities with NaN so they are imputed like any other gap
    X = X.replace([np.inf, -np.inf], np.nan)

    # Impute missing values with the median
    X = X.fillna(X.median())
    y = y.fillna(y.median())

    # Optional: Use SelectKBest to filter down to top k features
    if k_best is not None and k_best < X.shape[1]:
        selector = SelectKBest(score_func=f_regression, k=k_best)
        selector.fit(X, y)
        # Index the original frame so the selected columns keep their names
        selected_indices = selector.get_support(indices=True)
        X = X.iloc[:, selected_indices]

    return X, y
|
87 |
+
|
88 |
+
# ---------------------------------------------------------------------------
|
89 |
+
# 3. PCA TRANSFORMATION
|
90 |
+
# ---------------------------------------------------------------------------
|
91 |
+
def apply_pca(
    X: pd.DataFrame,
    n_components: int = 5
):
    """
    Standardize the data and apply PCA.

    Parameters:
    -----------
    X : pd.DataFrame
        The preprocessed numeric data.
    n_components : int
        Number of principal components to retain. A value larger than
        min(n_samples, n_features) is reduced to that limit.

    Returns:
    --------
    X_pca : np.ndarray
        PCA-transformed dataset.
    pca : PCA
        The trained PCA model (can be used for explained_variance_ etc.).
    """
    # Standardize the data
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # PCA raises when asked for more components than min(n_samples,
    # n_features), which happens as soon as feature selection keeps fewer
    # columns than the requested component count. Integral floats (e.g. 5.0
    # coming from a UI slider) are rejected as well, so normalise component
    # counts here. Fractional values below 1 are passed through untouched.
    if isinstance(n_components, (int, float, np.integer, np.floating)) \
            and n_components >= 1:
        n_components = min(int(n_components), *X_scaled.shape)

    # Apply PCA
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    return X_pca, pca
|
121 |
+
|
122 |
+
# ---------------------------------------------------------------------------
|
123 |
+
# 4. VISUALIZATION
|
124 |
+
# ---------------------------------------------------------------------------
|
125 |
+
def visualize_pca(
    X_pca: np.ndarray,
    y: pd.Series,
    pca,
    target_label: str = 'Median Income Household'
):
    """
    Render three PCA diagnostics to PNG files in the working directory.

    1) Scatter plot of the first two principal components.
    2) Pair plot of the first few principal components (up to 5).
    3) Scree plot showing the explained variance of each component.

    Parameters:
    -----------
    X_pca : np.ndarray
        PCA-transformed data (rows x components).
    y : pd.Series
        Target values, used for coloring points in plots.
    pca : PCA
        The fitted PCA model (for explained variance, etc.).
    target_label : str
        Label for the colorbar and the pair plot hue column.

    Returns:
    --------
    scatter_plot_file : str
        Filename for the PCA scatter plot.
    pair_plot_file : str
        Filename for the PCA pair plot.
    scree_plot_file : str
        Filename for the Scree plot.
    """
    # Output files are fixed names, overwritten on every call.
    scatter_plot_file = 'pca_scatter.png'
    pair_plot_file = 'pca_pairplot.png'
    scree_plot_file = 'pca_scree.png'

    # --- Scatter plot: PC1 against PC2, colored by the target ---------------
    plt.figure(figsize=(10, 6))
    first_pc = X_pca[:, 0]
    second_pc = X_pca[:, 1]
    plt.scatter(first_pc, second_pc, c=y, cmap='viridis', edgecolor='k', s=50)
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.title('PCA - First Two Principal Components')
    colorbar = plt.colorbar()
    colorbar.set_label(target_label)
    plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    # --- Pair plot: at most the first five components ------------------------
    num_components = min(X_pca.shape[1], 5)
    pc_names = [f'PC{idx}' for idx in range(1, num_components + 1)]
    pca_df = pd.DataFrame(X_pca[:, :num_components], columns=pc_names)
    # The target rides along as an extra column so seaborn can use it as hue.
    pca_df[target_label] = y.values
    sns.pairplot(pca_df, vars=pc_names, hue=target_label, palette='viridis')
    plt.suptitle('Pair Plot of Principal Components', y=1.02)
    plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    # --- Scree plot: explained variance ratio per component ------------------
    component_ids = range(1, pca.n_components_ + 1)
    plt.figure(figsize=(8, 5))
    plt.bar(component_ids, pca.explained_variance_ratio_, alpha=0.7, color='red')
    plt.xlabel('Principal Components')
    plt.ylabel('Variance Explained')
    plt.title('Scree Plot')
    plt.xticks(component_ids)
    plt.savefig(scree_plot_file, dpi=100, bbox_inches='tight')
    plt.close()

    return scatter_plot_file, pair_plot_file, scree_plot_file
|
211 |
|
212 |
+
# ---------------------------------------------------------------------------
|
213 |
+
# 5. GRADIO INTERFACE FUNCTION
|
214 |
+
# ---------------------------------------------------------------------------
|
215 |
+
def gradio_interface(
    target_column: str,
    n_components: int = 5,
    k_best: int = 0
):
    """
    Main interface function that:
    - Loads the Parquet data
    - Preprocesses the data (optionally using SelectKBest feature selection)
    - Applies PCA
    - Visualizes the PCA results
    - Returns file paths to the generated plots.

    Parameters:
    -----------
    target_column : str
        The name of the target column in the DataFrame.
    n_components : int
        Number of PCA components to use.
    k_best : int
        If > 0, select the top k_best features before PCA.
        If 0 or None, no feature selection is performed.

    Returns:
    --------
    scatter_plot : str
        File path to the PCA scatter plot.
    pair_plot : str
        File path to the pair plot of principal components.
    scree_plot : str
        File path to the scree plot of explained variance.
    """
    # UI sliders may deliver floats, and None is documented as "skip feature
    # selection"; comparing None with 0 would raise TypeError, so normalise
    # both numeric inputs before they reach scikit-learn.
    n_components = int(n_components)
    k_requested = int(k_best) if k_best else 0
    k_selected = k_requested if k_requested > 0 else None

    # Load data
    df = load_data('df_usa_health_features.parquet')

    # Preprocess data (optionally do feature selection)
    X, y = preprocess_data(df, target_column, k_selected)

    # Apply PCA
    X_pca, pca_model = apply_pca(X, n_components)

    # Generate and return visualizations
    scatter_plot, pair_plot, scree_plot = visualize_pca(
        X_pca, y, pca_model, target_label=target_column
    )
    return scatter_plot, pair_plot, scree_plot
|
259 |
+
|
260 |
+
# ---------------------------------------------------------------------------
|
261 |
+
# 6. BUILDING THE GRADIO APP
|
262 |
+
# ---------------------------------------------------------------------------
|
263 |
# The gr.inputs / gr.outputs namespaces and the `default=` keyword were
# deprecated in Gradio 3 and removed in Gradio 4; the top-level components
# with `value=` are the supported spelling.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=[
        gr.Textbox(label="Target Column", value="Median_Income_Household"),
        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0),
    ],
    outputs=[
        # gradio_interface returns paths of PNG files written to disk.
        gr.Image(type="filepath", label="PCA Scatter Plot"),
        gr.Image(type="filepath", label="PCA Pair Plot"),
        gr.Image(type="filepath", label="Scree Plot"),
    ],
    title="PCA Visualization with DuckDB and Gradio",
    description=(
        "Load data from a Parquet file, optionally perform feature selection, "
        "run PCA, and visualize the results. "
        "1) Enter the target column name (e.g., 'Median_Income_Household'). "
        "2) Choose the number of PCA components. "
        "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
    ),
)

# ---------------------------------------------------------------------------
# 7. LAUNCH THE APPLICATION
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    iface.launch()
|