Update app.py
app.py
CHANGED
@@ -9,162 +9,39 @@ from sklearn.decomposition import PCA
 from sklearn.preprocessing import StandardScaler
 from sklearn.feature_selection import SelectKBest, f_regression
 
-# ---------------------------------------------------------------------------
-# 1. LOADING DATA
-# ---------------------------------------------------------------------------
 def load_data(parquet_file: str) -> pd.DataFrame:
-    """
-    Load data from a Parquet file into a DuckDB in-memory database,
-    and return the result as a pandas DataFrame.
-
-    Parameters:
-    -----------
-    parquet_file : str
-        The path to the Parquet file to be loaded.
-
-    Returns:
-    --------
-    df : pd.DataFrame
-        Pandas DataFrame containing all columns from the Parquet file.
-    """
     con = duckdb.connect(database=':memory:')
     query = f"""
         CREATE TABLE data AS SELECT * FROM read_parquet('{parquet_file}')
     """
     con.execute(query)
-
     df = con.execute("SELECT * FROM data").fetchdf()
     return df
 
-
-# 2. DATA PREPROCESSING & OPTIONAL FEATURE SELECTION
-# ---------------------------------------------------------------------------
-def preprocess_data(
-    df: pd.DataFrame,
-    target_column: str,
-    k_best: int = None
-):
-    """
-    Perform data cleaning and (optionally) feature selection.
-
-    Parameters:
-    -----------
-    df : pd.DataFrame
-        The input DataFrame.
-    target_column : str
-        The name of the target variable in df.
-    k_best : int, optional
-        If provided, use SelectKBest with f_regression to select the
-        top k features. If None, no feature selection is performed.
-
-    Returns:
-    --------
-    X : pd.DataFrame
-        Preprocessed and optionally reduced DataFrame (features).
-    y : pd.Series
-        The target column as a Series.
-    """
-    # Separate out the numeric columns for PCA
+def preprocess_data(df: pd.DataFrame, target_column: str, k_best: int = None):
     X = df.select_dtypes(include=[float, int]).copy()
     y = df[target_column].copy()
-
-    # Replace infinities with NaN
     X.replace([np.inf, -np.inf], np.nan, inplace=True)
-
-    # Impute missing values with the median
     X = X.fillna(X.median())
     y = y.fillna(y.median())
-
-    # Optional: Use SelectKBest to filter down to top k features
     if k_best is not None and k_best < X.shape[1]:
         selector = SelectKBest(score_func=f_regression, k=k_best)
         X_selected = selector.fit_transform(X, y)
-        # We still want feature names to keep track of the columns
         selected_indices = selector.get_support(indices=True)
         X = X.iloc[:, selected_indices]
-
     return X, y
 
-
-# 3. PCA TRANSFORMATION
-# ---------------------------------------------------------------------------
-def apply_pca(
-    X: pd.DataFrame,
-    n_components: int = 5
-):
-    """
-    Standardize the data and apply PCA.
-
-    Parameters:
-    -----------
-    X : pd.DataFrame
-        The preprocessed numeric data.
-    n_components : int
-        Number of principal components to retain.
-
-    Returns:
-    --------
-    X_pca : np.ndarray
-        PCA-transformed dataset.
-    pca : PCA
-        The trained PCA model (can be used for explained_variance_ etc.).
-    """
-    # Standardize the data
+def apply_pca(X: pd.DataFrame, n_components: int = 5):
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X)
-
-    # Apply PCA
     pca = PCA(n_components=n_components)
     X_pca = pca.fit_transform(X_scaled)
-
     return X_pca, pca
 
-
-# 4. VISUALIZATION
-# ---------------------------------------------------------------------------
-def visualize_pca(
-    X_pca: np.ndarray,
-    y: pd.Series,
-    pca,
-    target_label: str = 'Median Income Household'
-):
-    """
-    Generate visualizations:
-      1) Scatter plot of first two principal components.
-      2) Pair plot of the first few principal components (up to 5).
-      3) Scree plot showing the explained variance of each component.
-
-    Parameters:
-    -----------
-    X_pca : np.ndarray
-        PCA-transformed data (rows x components).
-    y : pd.Series
-        Target values, used for coloring points in plots.
-    pca : PCA
-        The fitted PCA model (for explained variance, etc.).
-    target_label : str
-        Label for the colorbar representing the target variable.
-
-    Returns:
-    --------
-    scatter_plot_file : str
-        Filename for the PCA scatter plot.
-    pair_plot_file : str
-        Filename for the PCA pair plot.
-    scree_plot_file : str
-        Filename for the Scree plot.
-    """
-    # 4.a: Scatter plot for the first two components
+def visualize_pca(X_pca: np.ndarray, y: pd.Series, pca, target_label: str = 'Median Income Household'):
     scatter_plot_file = 'pca_scatter.png'
     plt.figure(figsize=(10, 6))
-    plt.scatter(
-        X_pca[:, 0],
-        X_pca[:, 1],
-        c=y,
-        cmap='viridis',
-        edgecolor='k',
-        s=50
-    )
+    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y, cmap='viridis', edgecolor='k', s=50)
     plt.xlabel('Principal Component 1')
     plt.ylabel('Principal Component 2')
     plt.title('PCA - First Two Principal Components')
@@ -173,33 +50,22 @@ def visualize_pca(
     plt.savefig(scatter_plot_file, dpi=100, bbox_inches='tight')
     plt.close()
 
-    #
-    num_components = min(X_pca.shape[1], 5)
+    # Pair plot
     pair_plot_file = 'pca_pairplot.png'
-
-
-
-
-    pca_df
-
-        pca_df,
-        vars=[f'PC{i+1}' for i in range(num_components)],
-        hue=target_label,
-        palette='viridis'
-    )
+    num_components = min(X_pca.shape[1], 5)
+    pca_df = pd.DataFrame(X_pca[:, :num_components],
+                          columns=[f'PC{i+1}' for i in range(num_components)])
+    pca_df[target_label] = y.values
+    sns.pairplot(pca_df, vars=[f'PC{i+1}' for i in range(num_components)],
+                 hue=target_label, palette='viridis')
     plt.suptitle('Pair Plot of Principal Components', y=1.02)
     plt.savefig(pair_plot_file, dpi=100, bbox_inches='tight')
     plt.close()
 
-    #
+    # Scree plot
     scree_plot_file = 'pca_scree.png'
     plt.figure(figsize=(8, 5))
-    plt.bar(
-        range(1, pca.n_components_ + 1),
-        pca.explained_variance_ratio_,
-        alpha=0.7,
-        color='red'
-    )
+    plt.bar(range(1, pca.n_components_ + 1), pca.explained_variance_ratio_, alpha=0.7, color='red')
     plt.xlabel('Principal Components')
     plt.ylabel('Variance Explained')
     plt.title('Scree Plot')
@@ -209,81 +75,37 @@ def visualize_pca(
 
     return scatter_plot_file, pair_plot_file, scree_plot_file
 
-
-# 5. GRADIO INTERFACE FUNCTION
-# ---------------------------------------------------------------------------
-def gradio_interface(
-    target_column: str,
-    n_components: int = 5,
-    k_best: int = 0
-):
-    """
-    Main interface function that:
-      - Loads the Parquet data
-      - Preprocesses the data (optionally using SelectKBest feature selection)
-      - Applies PCA
-      - Visualizes the PCA results
-      - Returns file paths to the generated plots.
-
-    Parameters:
-    -----------
-    target_column : str
-        The name of the target column in the DataFrame.
-    n_components : int
-        Number of PCA components to use.
-    k_best : int
-        If > 0, select the top k_best features before PCA.
-        If 0 or None, no feature selection is performed.
-
-    Returns:
-    --------
-    scatter_plot : str
-        File path to the PCA scatter plot.
-    pair_plot : str
-        File path to the pair plot of principal components.
-    scree_plot : str
-        File path to the scree plot of explained variance.
-    """
-    # Load data
+def gradio_interface(target_column: str, n_components: int = 5, k_best: int = 0):
     df = load_data('df_usa_health_features.parquet')
-
-    # Preprocess data (optionally do feature selection)
     X, y = preprocess_data(df, target_column, k_best if k_best > 0 else None)
-
-    # Apply PCA
     X_pca, pca_model = apply_pca(X, n_components)
-
-    # Generate and return visualizations
     scatter_plot, pair_plot, scree_plot = visualize_pca(X_pca, y, pca_model, target_label=target_column)
     return scatter_plot, pair_plot, scree_plot
 
-#
-#
-#
+# ------------------------------------------------------------------------------
+# HERE is the updated Gradio interface with direct component calls (no .inputs)
+# ------------------------------------------------------------------------------
 iface = gr.Interface(
     fn=gradio_interface,
     inputs=[
-        gr.
-        gr.
-        gr.
+        gr.Textbox(label="Target Column", value="Median_Income_Household"),
+        gr.Slider(label="Number of PCA Components", minimum=2, maximum=20, step=1, value=5),
+        gr.Slider(label="SelectKBest (0 to skip)", minimum=0, maximum=50, step=1, value=0)
     ],
     outputs=[
-        gr.
-        gr.
-        gr.
+        gr.Image(type="file", label="PCA Scatter Plot"),
+        gr.Image(type="file", label="PCA Pair Plot"),
+        gr.Image(type="file", label="Scree Plot")
     ],
     title="PCA Visualization with DuckDB and Gradio",
     description=(
         "Load data from a Parquet file, optionally perform feature selection, "
-        "run PCA, and visualize the results
-        "1) Enter the target column name (e.g., 'Median_Income_Household')
-        "2) Choose the number of PCA components
+        "run PCA, and visualize the results.\n"
+        "1) Enter the target column name (e.g., 'Median_Income_Household').\n"
+        "2) Choose the number of PCA components.\n"
         "3) Optionally specify k for SelectKBest feature selection (set to 0 to skip)."
     )
 )
 
-# ---------------------------------------------------------------------------
-# 7. LAUNCH THE APPLICATION
-# ---------------------------------------------------------------------------
 if __name__ == "__main__":
     iface.launch()
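A note on the new outputs list: whether gr.Image accepts type="file" depends on the Gradio version pinned for this Space; recent releases accept only "numpy", "pil", or "filepath". Since visualize_pca returns file paths, a hedged alternative for current Gradio versions (labels unchanged, only the type value swapped) would be:

    gr.Image(type="filepath", label="PCA Scatter Plot"),
    gr.Image(type="filepath", label="PCA Pair Plot"),
    gr.Image(type="filepath", label="Scree Plot")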
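For a quick check of the pipeline outside the Gradio UI, the functions in app.py can be driven directly. A minimal sketch, assuming df_usa_health_features.parquet is present locally and contains a numeric Median_Income_Household column; the k_best and n_components values below are illustrative, not taken from the commit:

    # Exercise the data pipeline headlessly; importing app builds the Interface
    # but does not launch it (launch is guarded by __main__).
    from app import load_data, preprocess_data, apply_pca, visualize_pca

    df = load_data('df_usa_health_features.parquet')
    X, y = preprocess_data(df, 'Median_Income_Household', k_best=10)
    X_pca, pca_model = apply_pca(X, n_components=5)
    print('Explained variance ratios:', pca_model.explained_variance_ratio_)
    print('Plot files:', visualize_pca(X_pca, y, pca_model, target_label='Median_Income_Household'))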