Create app.py
app.py ADDED
@@ -0,0 +1,195 @@
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

def load_embeddings(embeddings_file_path):
    # Load county embeddings indexed by place, impute missing values,
    # and fit a PCA projection that is reused for the model features.
    county_embeddings = pd.read_csv(embeddings_file_path).set_index('place')
    numeric_cols = county_embeddings.select_dtypes(include=['number']).columns
    county_embeddings_numeric = county_embeddings[numeric_cols]
    imputer = SimpleImputer(strategy='mean')
    county_embeddings_imputed = imputer.fit_transform(county_embeddings_numeric)
    pca = PCA(n_components=330)
    pca.fit(county_embeddings_imputed)
    county_embeddings_pca = pca.transform(county_embeddings_imputed)
    return county_embeddings, county_embeddings_pca, pca, imputer

def load_unemployment_data(unemployment_file_path):
    # Reshape the wide unemployment table (one column per date) into long format
    # with one row per place/date pair.
    unemployment_data = pd.read_csv(unemployment_file_path).set_index('place')
    unemployment_long = unemployment_data.reset_index().melt(
        id_vars='place', var_name='date', value_name='unemployment_rate'
    )
    return unemployment_long

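# Expected input layout (an assumption inferred from the parsing above; the
# column names and dates below are only illustrative):
#   county embeddings CSV:  place, feature_1, feature_2, ..., feature_n
#   unemployment CSV:       place, 2020-01, 2020-02, ...  (wide format, melted
#                           into place/date/unemployment_rate rows)
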
def preprocess_data(county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer):
    # Prepare data for modeling
    X = unemployment_long.drop('unemployment_rate', axis=1)
    y = unemployment_long['unemployment_rate']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Merge embeddings
    county_embeddings.index = county_embeddings.index.astype(str)
    X_train['place'] = X_train['place'].astype(str)
    X_test['place'] = X_test['place'].astype(str)

    X_train = X_train.merge(county_embeddings, left_on='place', right_index=True, how='left')
    X_test = X_test.merge(county_embeddings, left_on='place', right_index=True, how='left')

    # Remove non-numeric columns
    numeric_cols_train = X_train.select_dtypes(include=['number']).columns
    X_train_numeric = X_train[numeric_cols_train]
    numeric_cols_test = X_test.select_dtypes(include=['number']).columns
    X_test_numeric = X_test[numeric_cols_test]

    # Impute missing values
    X_train_imputed = imputer.transform(X_train_numeric)
    X_test_imputed = imputer.transform(X_test_numeric)

    # Apply PCA
    X_train_pca = pca.transform(X_train_imputed)
    X_test_pca = pca.transform(X_test_imputed)

    return X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train

def train_and_evaluate_models(X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train):
    # Define models
    # (the GPU settings on XGBoost and CatBoost assume a CUDA-capable runtime;
    #  drop tree_method='gpu_hist' and task_type="GPU" to train on CPU)
    models = {
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, random_state=42, tree_method='gpu_hist'),
        "Ridge Regression": Ridge(alpha=1.0),
        "CatBoost": CatBoostRegressor(iterations=100, random_seed=42, task_type="GPU")
    }

    results = {}
    feature_importances = {}

    for name, model in models.items():
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        # Cast to plain floats so the metrics serialize cleanly in the JSON output
        results[name] = {'RMSE': float(rmse), 'R-squared': float(r2)}

        # Feature importances
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            feature_importances[name] = importances

    return results, feature_importances, numeric_cols_train

def plot_feature_importance(importances, feature_names, model_name):
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False).head(20)

    fig = plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
    plt.title(f'{model_name} Feature Importance')
    plt.tight_layout()
    # Keep a handle on the figure before closing it; calling plt.gcf() after
    # plt.close() would return a new, empty figure.
    plt.close(fig)
    return fig

def plot_metrics(results):
    metrics_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})

    # Capture each figure before closing it; plt.gcf() after plt.close()
    # would return a blank canvas.
    rmse_plot = plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='RMSE', data=metrics_df)
    plt.title('RMSE for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.close(rmse_plot)

    r2_plot = plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='R-squared', data=metrics_df)
    plt.title('R-squared for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.close(r2_plot)

    return rmse_plot, r2_plot

def main(embeddings_file_path, unemployment_file_path):
    # Load data
    county_embeddings, county_embeddings_pca, pca, imputer = load_embeddings(embeddings_file_path)
    unemployment_long = load_unemployment_data(unemployment_file_path)

    # Preprocess data
    X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train = preprocess_data(
        county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer
    )

    # Train and evaluate models
    results, feature_importances, feature_names = train_and_evaluate_models(
        X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train
    )

    # Plot metrics
    rmse_plot, r2_plot = plot_metrics(results)

    # Plot feature importance for models that have it
    feature_importance_plots = {}
    for model_name, importances in feature_importances.items():
        fig = plot_feature_importance(importances, [f'PC{i+1}' for i in range(len(importances))], model_name)
        feature_importance_plots[model_name] = fig

    return results, rmse_plot, r2_plot, feature_importance_plots

def gradio_app():
    with gr.Blocks() as demo:
        gr.Markdown("# County-Level Unemployment Rate Forecasting")
        gr.Markdown("Upload county embeddings and unemployment data to train models and visualize results.")

        with gr.Row():
            embeddings_file = gr.File(label="Upload County Embeddings CSV")
            unemployment_file = gr.File(label="Upload Unemployment Data CSV")

        run_button = gr.Button("Run Analysis")

        output_results = gr.JSON(label="Model Performance Metrics")
        output_rmse_plot = gr.Plot(label="RMSE Comparison")
        output_r2_plot = gr.Plot(label="R-squared Comparison")
        output_feature_importance = gr.Plot(label="Feature Importances")

        def run_analysis(embeddings_file, unemployment_file):
            if embeddings_file is None or unemployment_file is None:
                return gr.update(value="Please upload both embeddings and unemployment data files."), None, None, None

            # Read files
            embeddings_file_path = embeddings_file.name
            unemployment_file_path = unemployment_file.name

            # Run main analysis
            results, rmse_plot, r2_plot, feature_importance_plots = main(embeddings_file_path, unemployment_file_path)

            # For simplicity, display feature importance of Random Forest (if available)
            fi_plot = None
            if 'Random Forest' in feature_importance_plots:
                fi_plot = feature_importance_plots['Random Forest']

            return results, rmse_plot, r2_plot, fi_plot

        run_button.click(
            run_analysis,
            inputs=[embeddings_file, unemployment_file],
            outputs=[output_results, output_rmse_plot, output_r2_plot, output_feature_importance]
        )

    demo.launch()


if __name__ == "__main__":
    gradio_app()
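For the Space to build with these imports, a requirements.txt alongside app.py would need to list the packages used above. A minimal sketch, with versions left unpinned:

gradio
pandas
numpy
matplotlib
seaborn
scikit-learn
xgboost
catboost

Run locally, python app.py launches the same interface through demo.launch().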