LeonceNsh committed (verified)
Commit 823fa29 · Parent: e102d88

Create app.py

Files changed (1): app.py +195 -0
app.py ADDED
@@ -0,0 +1,195 @@
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge
from catboost import CatBoostRegressor

import warnings
warnings.filterwarnings('ignore')

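# Read the county embeddings CSV, keep only the numeric columns, mean-impute missing
# values, and fit a 330-component PCA (assumes the embeddings have at least 330 numeric columns).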
def load_embeddings(embeddings_file_path):
    county_embeddings = pd.read_csv(embeddings_file_path).set_index('place')
    numeric_cols = county_embeddings.select_dtypes(include=['number']).columns
    county_embeddings_numeric = county_embeddings[numeric_cols]
    imputer = SimpleImputer(strategy='mean')
    county_embeddings_imputed = imputer.fit_transform(county_embeddings_numeric)
    pca = PCA(n_components=330)
    pca.fit(county_embeddings_imputed)
    county_embeddings_pca = pca.transform(county_embeddings_imputed)
    return county_embeddings, county_embeddings_pca, pca, imputer

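# Reshape the wide unemployment table (one column per date) into long format
# with one row per (place, date) pair.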
def load_unemployment_data(unemployment_file_path):
    unemployment_data = pd.read_csv(unemployment_file_path).set_index('place')
    unemployment_long = unemployment_data.reset_index().melt(id_vars='place', var_name='date', value_name='unemployment_rate')
    return unemployment_long

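# Split the long-format data, attach each row's county embedding, and reuse the
# imputer and PCA fitted in load_embeddings so train and test share one feature space.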
def preprocess_data(county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer):
    # Prepare data for modeling
    X = unemployment_long.drop('unemployment_rate', axis=1)
    y = unemployment_long['unemployment_rate']

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Merge embeddings
    county_embeddings.index = county_embeddings.index.astype(str)
    X_train['place'] = X_train['place'].astype(str)
    X_test['place'] = X_test['place'].astype(str)

    X_train = X_train.merge(county_embeddings, left_on='place', right_index=True, how='left')
    X_test = X_test.merge(county_embeddings, left_on='place', right_index=True, how='left')

    # Remove non-numeric columns
    numeric_cols_train = X_train.select_dtypes(include=['number']).columns
    X_train_numeric = X_train[numeric_cols_train]
    numeric_cols_test = X_test.select_dtypes(include=['number']).columns
    X_test_numeric = X_test[numeric_cols_test]

    # Impute missing values
    X_train_imputed = imputer.transform(X_train_numeric)
    X_test_imputed = imputer.transform(X_test_numeric)

    # Apply PCA
    X_train_pca = pca.transform(X_train_imputed)
    X_test_pca = pca.transform(X_test_imputed)

    return X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train

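# Fit each model on the PCA features and report RMSE and R-squared on the held-out
# test set. Note: XGBoost and CatBoost are configured for GPU training below
# (tree_method='gpu_hist', task_type='GPU'); on a CPU-only machine these settings
# would need to be changed (e.g. tree_method='hist', task_type='CPU').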
def train_and_evaluate_models(X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train):
    # Define models
    models = {
        "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
        "XGBoost": XGBRegressor(n_estimators=100, random_state=42, tree_method='gpu_hist'),
        "Ridge Regression": Ridge(alpha=1.0),
        "CatBoost": CatBoostRegressor(iterations=100, random_seed=42, task_type="GPU")
    }

    results = {}
    feature_importances = {}

    for name, model in models.items():
        model.fit(X_train_pca, y_train)
        y_pred = model.predict(X_test_pca)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)
        results[name] = {'RMSE': rmse, 'R-squared': r2}

        # Feature importances
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            feature_importances[name] = importances

    return results, feature_importances, numeric_cols_train

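# Bar chart of the 20 most important PCA components for a fitted model.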
def plot_feature_importance(importances, feature_names, model_name):
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False).head(20)

    plt.figure(figsize=(10, 8))
    sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
    plt.title(f'{model_name} Feature Importance')
    plt.tight_layout()
    fig = plt.gcf()  # capture the figure before plt.close(); calling gcf() afterwards would return a new, empty figure
    plt.close()
    return fig

def plot_metrics(results):
    metrics_df = pd.DataFrame(results).T.reset_index().rename(columns={'index': 'Model'})

    plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='RMSE', data=metrics_df)
    plt.title('RMSE for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    rmse_plot = plt.gcf()  # capture before closing, otherwise an empty figure would be returned
    plt.close()

    plt.figure(figsize=(8, 6))
    sns.barplot(x='Model', y='R-squared', data=metrics_df)
    plt.title('R-squared for Each Model')
    plt.xticks(rotation=45)
    plt.tight_layout()
    r2_plot = plt.gcf()  # capture before closing, otherwise an empty figure would be returned
    plt.close()

    return rmse_plot, r2_plot

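# End-to-end pipeline: load both CSVs, build the train/test feature matrices,
# train the four models, and produce the comparison and feature-importance figures.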
def main(embeddings_file_path, unemployment_file_path):
    # Load data
    county_embeddings, county_embeddings_pca, pca, imputer = load_embeddings(embeddings_file_path)
    unemployment_long = load_unemployment_data(unemployment_file_path)

    # Preprocess data
    X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train = preprocess_data(
        county_embeddings, county_embeddings_pca, unemployment_long, pca, imputer
    )

    # Train and evaluate models
    results, feature_importances, feature_names = train_and_evaluate_models(
        X_train_pca, X_test_pca, y_train, y_test, numeric_cols_train
    )

    # Plot metrics
    rmse_plot, r2_plot = plot_metrics(results)

    # Plot feature importance for models that have it
    feature_importance_plots = {}
    for model_name, importances in feature_importances.items():
        fig = plot_feature_importance(importances, [f'PC{i+1}' for i in range(len(importances))], model_name)
        feature_importance_plots[model_name] = fig

    return results, rmse_plot, r2_plot, feature_importance_plots

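# Gradio UI: two CSV uploads, a run button, the metrics JSON, and three plots.
# Only the Random Forest importance plot is displayed, per the note in run_analysis.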
def gradio_app():
    with gr.Blocks() as demo:
        gr.Markdown("# County-Level Unemployment Rate Forecasting")
        gr.Markdown("Upload county embeddings and unemployment data to train models and visualize results.")

        with gr.Row():
            embeddings_file = gr.File(label="Upload County Embeddings CSV")
            unemployment_file = gr.File(label="Upload Unemployment Data CSV")

        run_button = gr.Button("Run Analysis")

        output_results = gr.JSON(label="Model Performance Metrics")
        output_rmse_plot = gr.Plot(label="RMSE Comparison")
        output_r2_plot = gr.Plot(label="R-squared Comparison")
        output_feature_importance = gr.Plot(label="Feature Importances")

        def run_analysis(embeddings_file, unemployment_file):
            if embeddings_file is None or unemployment_file is None:
                return gr.update(value="Please upload both embeddings and unemployment data files."), None, None, None

            # Read files
            embeddings_file_path = embeddings_file.name
            unemployment_file_path = unemployment_file.name

            # Run main analysis
            results, rmse_plot, r2_plot, feature_importance_plots = main(embeddings_file_path, unemployment_file_path)

            # For simplicity, display feature importance of Random Forest (if available)
            fi_plot = None
            if 'Random Forest' in feature_importance_plots:
                fi_plot = feature_importance_plots['Random Forest']

            return results, rmse_plot, r2_plot, fi_plot

        run_button.click(
            run_analysis,
            inputs=[embeddings_file, unemployment_file],
            outputs=[output_results, output_rmse_plot, output_r2_plot, output_feature_importance]
        )

    demo.launch()

if __name__ == "__main__":
    gradio_app()