Spaces:
Sleeping
Sleeping
import pandas as pd | |
import geopandas as gpd | |
import gradio as gr | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
import numpy as np | |
from scipy.cluster.hierarchy import linkage, leaves_list | |
# ========================
# Data Loading
# ========================
# All inputs are read from the current working directory at import time.
conus_data = pd.read_csv("conus27.csv")
county_geojson = gpd.read_file("county.geojson")
county_embeddings = pd.read_csv("county_embeddings.csv")
county_unemployment = pd.read_csv("county_unemployment.csv")
zcta_poverty = pd.read_csv("zcta_poverty.csv")
zcta_geojson = gpd.read_file("zcta.geojson")

# Every join below compares keys as strings. Both sides of each merge are
# cast explicitly: pandas refuses to merge an object column with a numeric
# one, and equal-looking keys of different types never match.
county_geojson['place'] = county_geojson['place'].astype(str)
zcta_geojson['place'] = zcta_geojson['place'].astype(str)

# Prepare unemployment data: every non-'place' column becomes a 'date' value.
county_unemployment_melted = county_unemployment.melt(
    id_vars=['place'], var_name='date', value_name='unemployment_rate'
)
county_unemployment_melted['place'] = county_unemployment_melted['place'].astype(str)
county_geojson_unemployment = county_geojson.merge(
    county_unemployment_melted, on='place', how='left'
)

# Prepare poverty data: every non-'place' column becomes a 'year' value.
zcta_poverty_melted = zcta_poverty.melt(
    id_vars=['place'], var_name='year', value_name='poverty_rate'
)
zcta_poverty_melted['place'] = zcta_poverty_melted['place'].astype(str)
zcta_geojson_poverty = zcta_geojson.merge(
    zcta_poverty_melted, on='place', how='left'
)

# Identify health metrics: columns named 'Percent_Person_<metric>'.
health_metrics = [col for col in conus_data.columns if col.startswith('Percent_Person_')]
simplified_metrics = [col.replace('Percent_Person_', '') for col in health_metrics]
# Short display name -> full column name in conus_data.
metric_mapping = dict(zip(simplified_metrics, health_metrics))

# Create a merged geodataframe for health metrics visualization.
# Prefer 'place' as the join key; fall back to 'GEOID' when both frames have it.
if 'place' in conus_data.columns:
    _health_key = 'place'
elif 'GEOID' in county_geojson.columns and 'GEOID' in conus_data.columns:
    _health_key = 'GEOID'
else:
    raise ValueError("No matching key found to merge health data with geodata.")

# assign() works on copies, so conus_data and county_geojson keep their
# original key columns while the merge itself sees string keys on both sides.
merged_health = county_geojson.assign(
    **{_health_key: county_geojson[_health_key].astype(str)}
).merge(
    conus_data.assign(**{_health_key: conus_data[_health_key].astype(str)}),
    on=_health_key,
    how='left',
)
# ======================== | |
# Utility Functions | |
# ======================== | |
def plot_health_metric(metric):
    """
    Plot the geographical distribution of a selected health metric.

    Parameters:
        metric: simplified metric name; must be a key of ``metric_mapping``.

    Returns:
        The matplotlib Figure holding the choropleth drawn from
        ``merged_health``.

    Raises:
        KeyError: if ``metric`` is not a key of ``metric_mapping``.
    """
    metric_full_name = metric_mapping[metric]
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    try:
        merged_health.plot(
            column=metric_full_name,
            cmap='viridis',
            markersize=50,
            legend=True,
            legend_kwds={'label': f"{metric} (%)"},
            ax=ax,
            alpha=0.7,
            edgecolor='black',
            linewidth=0.5,
            missing_kwds={"color": "lightgrey", "label": "No Data"},
        )
        ax.set_title(f'Geographical Distribution of {metric}', fontsize=15)
        ax.axis('off')
        # Lay out this figure explicitly instead of pyplot's "current" figure,
        # which is shared global state.
        fig.tight_layout()
    finally:
        # Drop the figure from pyplot's registry so repeated calls do not
        # accumulate open figures; the Figure object itself stays usable.
        plt.close(fig)
    return fig
def plot_health_histogram(metric):
    """
    Plot the distribution (histogram with KDE overlay) of a selected health metric.

    Parameters:
        metric: simplified metric name; must be a key of ``metric_mapping``.

    Returns:
        The matplotlib Figure holding the histogram of the metric's
        non-missing values in ``conus_data``.

    Raises:
        KeyError: if ``metric`` is not a key of ``metric_mapping``.
    """
    metric_full_name = metric_mapping[metric]
    data = conus_data[metric_full_name].dropna()
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.histplot(data, kde=True, color='teal', ax=ax)
        ax.set_title(f'Distribution of {metric} (%)', fontsize=15)
        ax.set_xlabel(f'{metric} (%)')
        ax.set_ylabel('Count')
        # Lay out this figure explicitly rather than pyplot's global
        # "current" figure.
        fig.tight_layout()
    finally:
        # Unregister from pyplot so figures do not pile up across calls;
        # the returned Figure object remains valid.
        plt.close(fig)
    return fig
def summarize_health_metrics(metric):
    """
    Build a two-column summary table for a selected health metric.

    The table holds the rows produced by ``Series.describe()`` on the
    metric's non-missing values, followed by two appended rows:
    'Median' and 'IQR' (75th percentile minus 25th percentile).

    Returns a DataFrame with columns 'Statistic' and 'Value'.
    """
    values = conus_data[metric_mapping[metric]].dropna()

    # describe() output reshaped to (Statistic, Value) rows.
    base_table = values.describe().to_frame().reset_index()
    base_table.columns = ['Statistic', 'Value']

    # Quartiles are returned in the order requested: 25th, then 75th.
    quartiles = values.quantile([0.25, 0.75])
    spread = quartiles.iloc[1] - quartiles.iloc[0]

    # Extra rows go at the end of the table, after the describe() rows.
    appended_rows = pd.DataFrame({
        'Statistic': ['Median', 'IQR'],
        'Value': [values.median(), spread],
    })
    return pd.concat([base_table, appended_rows], ignore_index=True)
def plot_correlation_matrix(selected_metrics):
    """
    Plot a correlation heatmap for the selected health metrics, with both
    axes reordered by average-linkage hierarchical clustering on the
    correlation distance (1 - r).

    Parameters:
        selected_metrics: iterable of simplified metric names (keys of
            ``metric_mapping``).

    Returns:
        The matplotlib Figure holding the annotated heatmap.

    Raises:
        KeyError: if a name is not a key of ``metric_mapping``.
    """
    # Local import: only this function needs squareform.
    from scipy.spatial.distance import squareform

    selected_columns = [metric_mapping[m] for m in selected_metrics]
    corr = conus_data[selected_columns].corr()

    # Clustering needs at least two items; a single metric is plotted as-is.
    if len(selected_columns) > 1:
        # Undefined correlations (NaN) are treated as "uncorrelated"
        # (distance 1) so that linkage receives finite values only.
        distance = (1 - corr).fillna(1.0).to_numpy(dtype=float, copy=True)
        # Guard against tiny negative values from floating-point rounding.
        np.clip(distance, 0.0, 2.0, out=distance)
        np.fill_diagonal(distance, 0.0)
        # linkage interprets a 2-D input as observation vectors; convert the
        # square matrix to condensed form so it is used as actual distances.
        linkage_matrix = linkage(squareform(distance, checks=False), method='average')
        idx = leaves_list(linkage_matrix)
        corr = corr.iloc[idx, :].iloc[:, idx]

    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(
            corr, annot=True, cmap='coolwarm', square=True, ax=ax,
            xticklabels=corr.columns, yticklabels=corr.columns,
            cbar_kws={"shrink": .8}
        )
        ax.set_title('Correlation Matrix (Hierarchically Clustered)', fontsize=15)
        # Style the tick labels of this heatmap's axes directly rather than
        # whichever axes pyplot currently considers active.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        plt.setp(ax.get_yticklabels(), rotation=0)
        fig.tight_layout()
    finally:
        # Unregister from pyplot so figures do not accumulate across calls;
        # the returned Figure object remains valid.
        plt.close(fig)
    return fig
def plot_unemployment_map(date):
    """
    Plot the county unemployment-rate map for a selected date.

    Parameters:
        date: value compared (after ``str()``) against the 'date' column of
            ``county_geojson_unemployment``.

    Returns:
        The matplotlib Figure holding the choropleth of 'unemployment_rate'
        for the rows whose 'date' equals the given value.
    """
    date = str(date)
    data = county_geojson_unemployment[county_geojson_unemployment['date'] == date]
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    try:
        data.plot(
            column='unemployment_rate',
            cmap='YlGnBu',
            linewidth=0.5,
            ax=ax,
            edgecolor='0.8',
            legend=True,
            missing_kwds={"color": "lightgrey", "label": "Missing values"},
        )
        ax.set_title(f'Unemployment Rate by County ({date})', fontsize=15)
        ax.axis('off')
        # Lay out this figure explicitly rather than pyplot's global
        # "current" figure.
        fig.tight_layout()
    finally:
        # Unregister from pyplot so figures do not accumulate across calls;
        # the returned Figure object remains valid.
        plt.close(fig)
    return fig
def plot_poverty_map(year):
    """
    Plot the ZCTA poverty-rate map for a selected year.

    Parameters:
        year: value compared (after ``str()``) against the 'year' column of
            ``zcta_geojson_poverty``.

    Returns:
        The matplotlib Figure holding the choropleth of 'poverty_rate' for
        the rows whose 'year' equals the given value.
    """
    year = str(year)
    data = zcta_geojson_poverty[zcta_geojson_poverty['year'] == year]
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))
    try:
        data.plot(
            column='poverty_rate',
            cmap='YlOrRd',
            linewidth=0.5,
            ax=ax,
            edgecolor='0.8',
            legend=True,
            missing_kwds={"color": "lightgrey", "label": "Missing values"},
        )
        ax.set_title(f'Poverty Rate by ZCTA ({year})', fontsize=15)
        ax.axis('off')
        # Lay out this figure explicitly rather than pyplot's global
        # "current" figure.
        fig.tight_layout()
    finally:
        # Unregister from pyplot so figures do not accumulate across calls;
        # the returned Figure object remains valid.
        plt.close(fig)
    return fig
# ======================== | |
# Gradio Interface Functions | |
# ======================== | |
def health_metric_interface(metric):
    """
    Produce the three outputs for a selected health metric, in order:
    the map figure, the summary-statistics table, and the histogram figure.
    """
    # Tuple elements are evaluated left to right: map, summary, histogram.
    return (
        plot_health_metric(metric),
        summarize_health_metrics(metric),
        plot_health_histogram(metric),
    )
def correlation_interface(metrics):
    """
    Callback that renders the correlation matrix for the chosen metrics.

    Parameters:
        metrics: list of simplified metric names selected by the user.

    Returns:
        The matplotlib Figure produced by ``plot_correlation_matrix``.

    Raises:
        gr.Error: if fewer than two metrics are selected. The output this
            callback feeds is a plot component, which expects a figure rather
            than a text message, so the hint is surfaced as a Gradio error
            instead of being returned as a string.
    """
    # Require at least two metrics to show correlation (also covers an
    # empty or missing selection).
    if not metrics or len(metrics) < 2:
        raise gr.Error("Please select at least two metrics to see a correlation matrix.")
    return plot_correlation_matrix(metrics)
def unemployment_interface(date):
    """Return the unemployment map figure for the selected date."""
    return plot_unemployment_map(date)
def poverty_interface(year):
    """Return the poverty map figure for the selected year."""
    return plot_poverty_map(year)
# ========================
# Gradio App Setup
# ========================
# One tab per dataset. Each tab wires a selector's `change` event to the
# matching *_interface callback, so outputs refresh when the selection changes.
with gr.Blocks(title="US Population Health Dashboard") as demo:
    gr.Markdown("# US Population Health Dashboard")
    gr.Markdown("""
    Explore health metrics, socioeconomic data, and their geospatial distributions across the United States.
    Use the tabs below to select different datasets and visualizations.
    """)

    # --- Tab 1: single health metric (map + summary table + histogram) ---
    with gr.Tab("Health Metrics"):
        gr.Markdown("### Explore a Selected Health Metric")
        gr.Markdown("Select a health metric to view its geographical distribution, summary statistics, and distribution histogram.")
        health_metric = gr.Dropdown(
            label="Select a Health Metric",
            choices=simplified_metrics,
            value=simplified_metrics[0],
        )
        health_plot = gr.Plot(label="Health Metric Map")
        health_summary = gr.Dataframe(
            label="Summary Statistics",
            headers=["Statistic", "Value"],
        )
        health_hist = gr.Plot(label="Metric Distribution Histogram")
        health_metric.change(
            health_metric_interface,
            inputs=health_metric,
            outputs=[health_plot, health_summary, health_hist],
        )

    # --- Tab 2: correlation heatmap across several metrics ---
    with gr.Tab("Health Metrics Correlation"):
        gr.Markdown("### Correlation Between Health Metrics")
        gr.Markdown("Select multiple health metrics to see how they correlate with each other. The matrix is reordered using hierarchical clustering.")
        correlation_metrics = gr.CheckboxGroup(
            label="Select Health Metrics",
            choices=simplified_metrics,
            value=simplified_metrics[:5],
        )
        correlation_plot = gr.Plot(label="Correlation Matrix")
        correlation_metrics.change(
            correlation_interface,
            inputs=correlation_metrics,
            outputs=correlation_plot,
        )

    # --- Tab 3: county unemployment map for one date ---
    with gr.Tab("Unemployment Rates Over Time"):
        gr.Markdown("### View Unemployment Rates by County")
        gr.Markdown("Select a date to see the unemployment rate distribution across counties.")
        # Choices are the distinct 'date' values of the melted table, sorted.
        unique_dates = sorted(county_unemployment_melted['date'].unique())
        unemployment_date = gr.Dropdown(
            label="Select a Date",
            choices=unique_dates,
            value=unique_dates[0],
        )
        unemployment_plot = gr.Plot(label="Unemployment Rate Map")
        unemployment_date.change(
            unemployment_interface,
            inputs=unemployment_date,
            outputs=unemployment_plot,
        )

    # --- Tab 4: ZCTA poverty map for one year ---
    with gr.Tab("Poverty Rates Over Time"):
        gr.Markdown("### View Poverty Rates by ZCTA")
        gr.Markdown("Select a year to see the poverty rate distribution across ZIP Code Tabulation Areas.")
        # Choices are the distinct 'year' values of the melted table, sorted.
        unique_years = sorted(zcta_poverty_melted['year'].unique())
        poverty_year = gr.Dropdown(
            label="Select a Year",
            choices=unique_years,
            value=unique_years[0],
        )
        poverty_plot = gr.Plot(label="Poverty Rate Map")
        poverty_year.change(
            poverty_interface,
            inputs=poverty_year,
            outputs=poverty_plot,
        )

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()