Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
@@ -4,101 +4,151 @@ import gradio as gr
|
|
4 |
import matplotlib.pyplot as plt
|
5 |
import seaborn as sns
|
6 |
import numpy as np
|
|
|
7 |
|
8 |
# ========================
|
9 |
# Data Loading
|
10 |
# ========================
|
11 |
-
|
12 |
-
# Load the health and demographic data
|
13 |
conus_data = pd.read_csv("conus27.csv")
|
14 |
-
# Load the county shapefile
|
15 |
county_geojson = gpd.read_file("county.geojson")
|
16 |
-
# Load the county embeddings
|
17 |
county_embeddings = pd.read_csv("county_embeddings.csv")
|
18 |
-
# Load the unemployment data
|
19 |
county_unemployment = pd.read_csv("county_unemployment.csv")
|
20 |
-
# Load the poverty data
|
21 |
zcta_poverty = pd.read_csv("zcta_poverty.csv")
|
22 |
-
# Load the ZCTA shapefile
|
23 |
zcta_geojson = gpd.read_file("zcta.geojson")
|
24 |
|
25 |
-
#
|
26 |
-
county_unemployment_melted = county_unemployment.melt(
|
27 |
-
|
28 |
-
|
29 |
county_unemployment_melted['place'] = county_unemployment_melted['place'].astype(str)
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
|
34 |
# Prepare poverty data
|
35 |
-
zcta_poverty_melted = zcta_poverty.melt(
|
36 |
-
|
37 |
-
|
38 |
zcta_poverty_melted['place'] = zcta_poverty_melted['place'].astype(str)
|
39 |
-
|
40 |
-
|
41 |
zcta_geojson['place'] = zcta_geojson['place'].astype(str)
|
|
|
|
|
|
|
42 |
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
# List of health metrics available
|
47 |
health_metrics = [col for col in conus_data.columns if col.startswith('Percent_Person_')]
|
48 |
-
# Simplify metric names
|
49 |
simplified_metrics = [col.replace('Percent_Person_', '') for col in health_metrics]
|
50 |
metric_mapping = dict(zip(simplified_metrics, health_metrics))
|
51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
52 |
# ========================
|
53 |
# Utility Functions
|
54 |
# ========================
|
55 |
|
56 |
def plot_health_metric(metric):
|
57 |
"""
|
58 |
-
Plots the geographical distribution of a selected health metric.
|
59 |
"""
|
60 |
-
|
61 |
metric_full_name = metric_mapping[metric]
|
62 |
-
|
63 |
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
|
64 |
-
|
65 |
column=metric_full_name,
|
66 |
-
cmap='
|
67 |
markersize=50,
|
68 |
legend=True,
|
69 |
legend_kwds={'label': f"{metric} (%)"},
|
70 |
ax=ax,
|
71 |
alpha=0.7,
|
72 |
-
edgecolor='
|
|
|
|
|
73 |
)
|
74 |
ax.set_title(f'Geographical Distribution of {metric}', fontsize=15)
|
75 |
ax.axis('off')
|
76 |
plt.tight_layout()
|
77 |
return fig
|
78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
def plot_correlation_matrix(selected_metrics):
|
80 |
"""
|
81 |
-
Plots
|
82 |
"""
|
83 |
-
selected_columns = [metric_mapping[
|
84 |
corr = conus_data[selected_columns].corr()
|
|
|
|
|
|
|
|
|
|
|
|
|
85 |
fig, ax = plt.subplots(figsize=(10, 8))
|
86 |
-
sns.heatmap(
|
87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
plt.tight_layout()
|
89 |
return fig
|
90 |
|
91 |
def plot_unemployment_map(date):
|
92 |
"""
|
93 |
-
Plots the unemployment rate map for a selected date.
|
94 |
"""
|
95 |
date = str(date)
|
96 |
data = county_geojson_unemployment[county_geojson_unemployment['date'] == date]
|
97 |
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
|
98 |
data.plot(
|
99 |
column='unemployment_rate',
|
100 |
-
cmap='
|
101 |
-
linewidth=0.
|
102 |
ax=ax,
|
103 |
edgecolor='0.8',
|
104 |
legend=True,
|
@@ -111,15 +161,15 @@ def plot_unemployment_map(date):
|
|
111 |
|
112 |
def plot_poverty_map(year):
|
113 |
"""
|
114 |
-
Plots the poverty rate map for a selected year.
|
115 |
"""
|
116 |
year = str(year)
|
117 |
data = zcta_geojson_poverty[zcta_geojson_poverty['year'] == year]
|
118 |
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
|
119 |
data.plot(
|
120 |
column='poverty_rate',
|
121 |
-
cmap='
|
122 |
-
linewidth=0.
|
123 |
ax=ax,
|
124 |
edgecolor='0.8',
|
125 |
legend=True,
|
@@ -130,25 +180,20 @@ def plot_poverty_map(year):
|
|
130 |
plt.tight_layout()
|
131 |
return fig
|
132 |
|
133 |
-
def summarize_health_metrics(metric):
|
134 |
-
"""
|
135 |
-
Generates summary statistics for a selected health metric.
|
136 |
-
"""
|
137 |
-
metric_full_name = metric_mapping[metric]
|
138 |
-
summary = conus_data[metric_full_name].describe().to_frame().reset_index()
|
139 |
-
summary.columns = ['Statistic', 'Value']
|
140 |
-
return summary
|
141 |
-
|
142 |
# ========================
|
143 |
# Gradio Interface Functions
|
144 |
# ========================
|
145 |
|
146 |
def health_metric_interface(metric):
|
147 |
-
|
148 |
summary = summarize_health_metrics(metric)
|
149 |
-
|
|
|
150 |
|
151 |
def correlation_interface(metrics):
|
|
|
|
|
|
|
152 |
fig = plot_correlation_matrix(metrics)
|
153 |
return fig
|
154 |
|
@@ -166,25 +211,46 @@ def poverty_interface(year):
|
|
166 |
|
167 |
with gr.Blocks(title="US Population Health Dashboard") as demo:
|
168 |
gr.Markdown("# US Population Health Dashboard")
|
169 |
-
gr.Markdown("
|
170 |
-
|
171 |
-
|
172 |
-
|
|
|
|
|
|
|
|
|
173 |
health_metric = gr.Dropdown(label="Select a Health Metric", choices=simplified_metrics, value=simplified_metrics[0])
|
174 |
-
health_plot = gr.Plot()
|
175 |
-
health_summary = gr.Dataframe(headers=["Statistic", "Value"])
|
176 |
-
|
|
|
177 |
|
178 |
with gr.Tab("Health Metrics Correlation"):
|
179 |
-
gr.Markdown("### Correlation
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
182 |
correlation_metrics.change(correlation_interface, inputs=correlation_metrics, outputs=correlation_plot)
|
183 |
|
184 |
-
|
185 |
-
|
186 |
-
|
187 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
188 |
|
189 |
if __name__ == "__main__":
|
190 |
-
demo.launch()
|
|
|
4 |
import matplotlib.pyplot as plt
|
5 |
import seaborn as sns
|
6 |
import numpy as np
|
7 |
+
from scipy.cluster.hierarchy import linkage, leaves_list
|
8 |
|
9 |
# ========================
|
10 |
# Data Loading
|
11 |
# ========================
|
|
|
|
|
12 |
# Raw inputs: CSV tables and GeoJSON boundary files, read from the working
# directory when the module is imported.
conus_data = pd.read_csv("conus27.csv")
county_geojson = gpd.read_file("county.geojson")
county_embeddings = pd.read_csv("county_embeddings.csv")
county_unemployment = pd.read_csv("county_unemployment.csv")
zcta_poverty = pd.read_csv("zcta_poverty.csv")
zcta_geojson = gpd.read_file("zcta.geojson")

# Prepare unemployment data
# Reshape wide -> long: every non-'place' column name becomes a 'date' value.
county_unemployment_melted = county_unemployment.melt(
    id_vars=['place'], var_name='date', value_name='unemployment_rate'
)
county_unemployment_melted['place'] = county_unemployment_melted['place'].astype(str)
# Join on a string-typed key on BOTH sides, exactly as the ZCTA merge below
# does. ``assign`` returns a copy, so ``county_geojson`` keeps its original
# 'place' dtype for the health-metric merge further down.
county_geojson_unemployment = county_geojson.assign(
    place=county_geojson['place'].astype(str)
).merge(county_unemployment_melted, on='place', how='left')

# Prepare poverty data
# Same wide -> long reshape, with the former column names stored as 'year'.
zcta_poverty_melted = zcta_poverty.melt(
    id_vars=['place'], var_name='year', value_name='poverty_rate'
)
zcta_poverty_melted['place'] = zcta_poverty_melted['place'].astype(str)
zcta_geojson['place'] = zcta_geojson['place'].astype(str)
zcta_geojson_poverty = zcta_geojson.merge(
    zcta_poverty_melted, on='place', how='left'
)

# Identify health metrics
# Columns prefixed 'Percent_Person_' are the metrics; the UI shows the name
# without the prefix and ``metric_mapping`` translates it back.
health_metrics = [col for col in conus_data.columns if col.startswith('Percent_Person_')]
simplified_metrics = [col.replace('Percent_Person_', '') for col in health_metrics]
metric_mapping = dict(zip(simplified_metrics, health_metrics))

# Create a merged geodataframe for health metrics visualization.
# Prefer 'place' as the join key; fall back to 'GEOID' only when both frames
# carry it, and fail loudly at import time otherwise.
if 'place' in conus_data.columns:
    merged_health = county_geojson.merge(conus_data, on='place', how='left')
elif 'GEOID' in county_geojson.columns and 'GEOID' in conus_data.columns:
    merged_health = county_geojson.merge(conus_data, on='GEOID', how='left')
else:
    raise ValueError("No matching key found to merge health data with geodata.")
|
53 |
+
|
54 |
# ========================
|
55 |
# Utility Functions
|
56 |
# ========================
|
57 |
|
58 |
def plot_health_metric(metric):
    """
    Draw a map of one health metric over the merged county geodata.

    Args:
        metric: Simplified metric name (a key of ``metric_mapping``).

    Returns:
        The matplotlib Figure containing the map; rows with no value for the
        metric are drawn in light grey.
    """
    column_name = metric_mapping[metric]
    fig, ax = plt.subplots(figsize=(12, 8))

    # Collected in one place so the styling reads as a single unit.
    plot_options = {
        'column': column_name,
        'cmap': 'viridis',
        'markersize': 50,
        'legend': True,
        'legend_kwds': {'label': f"{metric} (%)"},
        'ax': ax,
        'alpha': 0.7,
        'edgecolor': 'black',
        'linewidth': 0.5,
        'missing_kwds': {"color": "lightgrey", "label": "No Data"},
    }
    merged_health.plot(**plot_options)

    ax.set_title(f'Geographical Distribution of {metric}', fontsize=15)
    ax.axis('off')
    fig.tight_layout()
    return fig
|
80 |
|
81 |
+
def plot_health_histogram(metric):
    """
    Draw a histogram with a KDE overlay for one health metric.

    Args:
        metric: Simplified metric name (a key of ``metric_mapping``).

    Returns:
        The matplotlib Figure containing the histogram of the metric's
        non-missing values in ``conus_data``.
    """
    column_name = metric_mapping[metric]
    values = conus_data[column_name].dropna()

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.histplot(values, kde=True, color='teal', ax=ax)

    ax.set_title(f'Distribution of {metric} (%)', fontsize=15)
    ax.set_xlabel(f'{metric} (%)')
    ax.set_ylabel('Count')
    fig.tight_layout()
    return fig
|
94 |
+
|
95 |
+
def summarize_health_metrics(metric):
    """
    Build a two-column summary table for one health metric.

    The table holds the rows produced by ``describe()`` followed by two extra
    rows, 'Median' and 'IQR', appended at the end.

    Args:
        metric: Simplified metric name (a key of ``metric_mapping``).

    Returns:
        A DataFrame with columns 'Statistic' and 'Value'.
    """
    values = conus_data[metric_mapping[metric]].dropna()

    # Standard describe() output, flattened to (Statistic, Value) rows.
    described = values.describe()
    base_rows = pd.DataFrame({
        'Statistic': described.index,
        'Value': described.values,
    })

    # Interquartile range = 75th percentile minus 25th percentile.
    lower_quartile = values.quantile(0.25)
    upper_quartile = values.quantile(0.75)
    extra_rows = pd.DataFrame({
        'Statistic': ['Median', 'IQR'],
        'Value': [values.median(), upper_quartile - lower_quartile],
    })

    return pd.concat([base_rows, extra_rows], ignore_index=True)
|
116 |
+
|
117 |
def plot_correlation_matrix(selected_metrics):
    """
    Plots a correlation matrix for selected health metrics and reorders the
    axes using hierarchical clustering.

    Args:
        selected_metrics: Simplified metric names (keys of ``metric_mapping``).

    Returns:
        The matplotlib Figure containing the annotated heatmap.

    Raises:
        KeyError: If a name is not a key of ``metric_mapping``.
    """
    # Local import: only this function needs it.
    from scipy.spatial.distance import squareform

    selected_columns = [metric_mapping[m] for m in selected_metrics]
    corr = conus_data[selected_columns].corr()

    # Hierarchical clustering to reorder correlation matrix.
    # With one or two metrics there is nothing to reorder, and a single
    # metric would give linkage() an empty distance vector.
    if len(selected_columns) > 2:
        # A NaN correlation would make linkage() raise; treat it as
        # "uncorrelated" for ordering purposes only (the heatmap still shows
        # the original values).
        distance = 1 - corr.fillna(0).to_numpy()
        # linkage() interprets a 2-D array as observation vectors, not as
        # pairwise distances, so hand it the condensed (upper-triangle) form.
        # checks=False tolerates floating-point noise on the diagonal.
        condensed = squareform(distance, checks=False)
        idx = leaves_list(linkage(condensed, method='average'))
        corr = corr.iloc[idx, idx]

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        corr, annot=True, cmap='coolwarm', square=True, ax=ax,
        xticklabels=corr.columns, yticklabels=corr.columns,
        cbar_kws={"shrink": .8}
    )
    ax.set_title('Correlation Matrix (Hierarchically Clustered)', fontsize=15)
    # Rotate labels on this Axes explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()
    return fig
|
140 |
|
141 |
def plot_unemployment_map(date):
|
142 |
"""
|
143 |
+
Plots the unemployment rate map for a selected date with an improved colormap.
|
144 |
"""
|
145 |
date = str(date)
|
146 |
data = county_geojson_unemployment[county_geojson_unemployment['date'] == date]
|
147 |
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
|
148 |
data.plot(
|
149 |
column='unemployment_rate',
|
150 |
+
cmap='YlGnBu',
|
151 |
+
linewidth=0.5,
|
152 |
ax=ax,
|
153 |
edgecolor='0.8',
|
154 |
legend=True,
|
|
|
161 |
|
162 |
def plot_poverty_map(year):
|
163 |
"""
|
164 |
+
Plots the poverty rate map for a selected year with improved colormap.
|
165 |
"""
|
166 |
year = str(year)
|
167 |
data = zcta_geojson_poverty[zcta_geojson_poverty['year'] == year]
|
168 |
fig, ax = plt.subplots(1, 1, figsize=(12, 8))
|
169 |
data.plot(
|
170 |
column='poverty_rate',
|
171 |
+
cmap='YlOrRd',
|
172 |
+
linewidth=0.5,
|
173 |
ax=ax,
|
174 |
edgecolor='0.8',
|
175 |
legend=True,
|
|
|
180 |
plt.tight_layout()
|
181 |
return fig
|
182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
183 |
# ========================
|
184 |
# Gradio Interface Functions
|
185 |
# ========================
|
186 |
|
187 |
def health_metric_interface(metric):
    """
    Produce the three outputs shown for a selected health metric.

    Args:
        metric: Simplified metric name chosen in the UI.

    Returns:
        A tuple ``(map figure, summary DataFrame, histogram figure)``.
    """
    # Tuple elements are evaluated left to right: map, summary, histogram.
    return (
        plot_health_metric(metric),
        summarize_health_metrics(metric),
        plot_health_histogram(metric),
    )
|
192 |
|
193 |
def correlation_interface(metrics):
    """
    Build the correlation heatmap for the metrics ticked in the UI.

    Args:
        metrics: List of simplified metric names; may be empty or None when
            nothing is selected.

    Returns:
        The matplotlib Figure from ``plot_correlation_matrix``.

    Raises:
        gr.Error: If fewer than two metrics are selected. This handler feeds a
            plot output, which cannot display a plain string, so the message
            is surfaced through Gradio's error mechanism instead of being
            returned.
    """
    # Require at least two metrics to show correlation
    if not metrics or len(metrics) < 2:
        raise gr.Error("Please select at least two metrics to see a correlation matrix.")
    return plot_correlation_matrix(metrics)
|
199 |
|
|
|
211 |
|
212 |
# Dashboard layout. Components are declared in display order, one tab per
# dataset; each selector's ``change`` event is wired to its interface function.
with gr.Blocks(title="US Population Health Dashboard") as demo:
    gr.Markdown("# US Population Health Dashboard")
    gr.Markdown("""
    Explore health metrics, socioeconomic data, and their geospatial distributions across the United States.
    Use the tabs below to select different datasets and visualizations.
    """)

    # Tab 1: single metric -> map, summary table, histogram.
    with gr.Tab("Health Metrics"):
        gr.Markdown("### Explore a Selected Health Metric")
        gr.Markdown("Select a health metric to view its geographical distribution, summary statistics, and distribution histogram.")
        health_metric = gr.Dropdown(label="Select a Health Metric", choices=simplified_metrics, value=simplified_metrics[0])
        health_plot = gr.Plot(label="Health Metric Map")
        health_summary = gr.Dataframe(label="Summary Statistics", headers=["Statistic", "Value"])
        health_hist = gr.Plot(label="Metric Distribution Histogram")
        # Output order must match the tuple returned by health_metric_interface.
        health_metric.change(health_metric_interface, inputs=health_metric, outputs=[health_plot, health_summary, health_hist])

    # Tab 2: several metrics -> clustered correlation heatmap.
    with gr.Tab("Health Metrics Correlation"):
        gr.Markdown("### Correlation Between Health Metrics")
        gr.Markdown("Select multiple health metrics to see how they correlate with each other. The matrix is reordered using hierarchical clustering.")
        correlation_metrics = gr.CheckboxGroup(
            label="Select Health Metrics",
            choices=simplified_metrics,
            # Pre-select the first five metrics (fewer if fewer exist).
            value=simplified_metrics[:5]
        )
        correlation_plot = gr.Plot(label="Correlation Matrix")
        correlation_metrics.change(correlation_interface, inputs=correlation_metrics, outputs=correlation_plot)

    # Tab 3: county unemployment map for one date.
    with gr.Tab("Unemployment Rates Over Time"):
        gr.Markdown("### View Unemployment Rates by County")
        gr.Markdown("Select a date to see the unemployment rate distribution across counties.")
        # Dates come from the melted column names, sorted as Python values.
        unique_dates = sorted(county_unemployment_melted['date'].unique())
        unemployment_date = gr.Dropdown(label="Select a Date", choices=unique_dates, value=unique_dates[0])
        unemployment_plot = gr.Plot(label="Unemployment Rate Map")
        unemployment_date.change(unemployment_interface, inputs=unemployment_date, outputs=unemployment_plot)

    # Tab 4: ZCTA poverty map for one year.
    with gr.Tab("Poverty Rates Over Time"):
        gr.Markdown("### View Poverty Rates by ZCTA")
        gr.Markdown("Select a year to see the poverty rate distribution across ZIP Code Tabulation Areas.")
        unique_years = sorted(zcta_poverty_melted['year'].unique())
        poverty_year = gr.Dropdown(label="Select a Year", choices=unique_years, value=unique_years[0])
        poverty_plot = gr.Plot(label="Poverty Rate Map")
        poverty_year.change(poverty_interface, inputs=poverty_year, outputs=poverty_plot)

# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|