Spaces:
Running
Running
File size: 17,226 Bytes
ab05130 e9eeeff ab05130 a2a4759 ab05130 a2a4759 ab05130 a2a4759 ab05130 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 |
import streamlit as st
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
import folium
from streamlit_folium import st_folium
from datetime import timedelta
# ----------------------------------------------------
# 1. Load data
# ----------------------------------------------------
@st.cache_data
def load_data():
    """Read the daily and monthly CSV tables used by the dashboard.

    Both files are opened by relative path. The daily table has its
    ``date`` column parsed into datetimes while it is read; the monthly
    table is returned exactly as pandas reads it (no ``date`` column is
    added here). The function is wrapped by Streamlit's ``st.cache_data``.

    Returns:
        tuple: ``(daily_df, monthly_df)`` as pandas DataFrames.
    """
    daily_path = "daily_data_2013_2024.csv"
    monthly_path = "monthly_data_2013_2024.csv"
    frames = (
        pd.read_csv(daily_path, parse_dates=["date"]),
        pd.read_csv(monthly_path),
    )
    return frames
# Materialise both tables for the sections that follow.
daily_data, monthly_data = load_data()

# Latitude/longitude for every site name; the map section looks markers up here.
LOCATIONS = {
    site: {"lat": lat, "lon": lon}
    for site, lat, lon in (
        ("Karagwe", -1.7718, 30.9876),
        ("Masasi", -10.7167, 38.8000),
        ("Igunga", -4.2833, 33.8833),
    )
}
# ----------------------------------------------------
# 2. Streamlit UI Layout
# ----------------------------------------------------
st.title("Malaria & Dengue Outbreak Analysis (2013–2024)")
st.sidebar.header("Filters & Options")

# Disease whose risk/outbreak columns the rest of the page reads.
disease_choice = st.sidebar.radio("Select Disease", ["Malaria", "Dengue"], index=0)

# Which of the two tables (monthly or daily) to explore.
data_choice = st.sidebar.radio("Data Granularity", ["Monthly", "Daily"], index=0)

# Sites to include; every known site is pre-selected.
location_list = [*LOCATIONS]
selected_locations = st.sidebar.multiselect(
    "Select Location(s)",
    location_list,
    default=location_list,
)

if data_choice != "Monthly":
    # Daily view: a date-range picker bounded by the data's first/last date.
    date_col = daily_data["date"]
    date_min = date_col.min()
    date_max = date_col.max()
    date_range = st.sidebar.date_input(
        "Select Date Range",
        [date_min, date_max],
        min_value=date_min,
        max_value=date_max,
    )
else:
    # Monthly view: a year-range slider bounded by the years in the data.
    years = monthly_data["year"]
    year_min, year_max = int(years.min()), int(years.max())
    year_range = st.sidebar.slider(
        "Select Year Range",
        min_value=year_min,
        max_value=year_max,
        value=(year_min, year_max),
        step=1,
    )
# ----------------------------------------------------
# 3. Filter data based on user input
# ----------------------------------------------------
# Produces `df`, the working table every later section reads.
if data_choice == "Monthly":
    # Combine the location and year masks on the source frame and copy once,
    # so the "date" column added below is written to an independent frame
    # (assigning onto a slice-of-a-slice can raise SettingWithCopyWarning).
    in_locations = monthly_data["location"].isin(selected_locations)
    in_years = monthly_data["year"].between(year_range[0], year_range[1])
    df = monthly_data[in_locations & in_years].copy()
    # First day of each month, used as the x value for monthly plots.
    df["date"] = pd.to_datetime(
        df["year"].astype(str) + "-" + df["month"].astype(str) + "-01"
    )
else:
    # In range mode st.date_input returns fewer than two values while the
    # user has picked a start date but not yet an end date; indexing [1]
    # at that moment would raise IndexError. Pause this run instead.
    picked_dates = (
        tuple(date_range) if isinstance(date_range, (list, tuple)) else (date_range,)
    )
    if len(picked_dates) != 2:
        st.info("Please select both a start and an end date.")
        st.stop()
    start_date, end_date = (pd.to_datetime(d) for d in picked_dates)

    # Both bounds are inclusive, matching the original >= / <= comparison.
    in_locations = daily_data["location"].isin(selected_locations)
    in_dates = daily_data["date"].between(start_date, end_date)
    df = daily_data[in_locations & in_dates].copy()
# ----------------------------------------------------
# 4. Interactive Plotly Time-Series (Original)
# ----------------------------------------------------
st.subheader(f"{data_choice} {disease_choice} Risk & Climate Parameters")

# Column holding the risk value for the disease picked in the sidebar.
risk_col = "malaria_risk" if disease_choice == "Malaria" else "dengue_risk"

# Everything that differs between the two granularities is decided here so
# the three charts below are built by a single code path.
if data_choice == "Monthly":
    risk_title = f"{disease_choice} Risk Over Time ({data_choice})"
    temp_title = "Average Temperature (°C)"
    rain_field = "monthly_rainfall_mm"  # total rainfall per month
    rain_title = "Monthly Rainfall (mm)"
else:
    risk_title = f"{disease_choice} Daily Risk Over Time (2013–2024)"
    temp_title = "Daily Avg Temperature (°C)"
    rain_field = "daily_rainfall_mm"
    rain_title = "Daily Rainfall (mm)"

# Risk over time, one line per location.
fig = px.line(df, x="date", y=risk_col, color="location", title=risk_title)
fig.update_layout(yaxis_title="Risk (0–1)")
st.plotly_chart(fig, use_container_width=True)

# Temperature & rainfall side by side.
col1, col2 = st.columns(2)
with col1:
    fig_temp = px.line(df, x="date", y="temp_avg", color="location", title=temp_title)
    st.plotly_chart(fig_temp, use_container_width=True)
with col2:
    fig_rain = px.line(df, x="date", y=rain_field, color="location", title=rain_title)
    st.plotly_chart(fig_rain, use_container_width=True)

# The outbreak table is only shown for the monthly view.
if data_choice == "Monthly":
    flag_col = "malaria_outbreak" if disease_choice == "Malaria" else "dengue_outbreak"
    if flag_col not in df.columns:
        # Same guard and message as the outbreak-statistics section, so a
        # table without the flag column degrades to a note, not a KeyError.
        st.write(f"No outbreak flag column found for {disease_choice}.")
    else:
        outbreak_months = df[df[flag_col].eq(True)]
        if outbreak_months.empty:
            st.write(f"No months meet the {disease_choice} outbreak criteria in this selection.")
        else:
            st.write(f"**Months with likely {disease_choice} outbreak:**")
            st.dataframe(outbreak_months[[
                "location", "year", "month", "temp_avg",
                "humidity", "monthly_rainfall_mm", flag_col,
            ]])
# ----------------------------------------------------
# 5. Correlation Heatmap (Original)
# ----------------------------------------------------
st.subheader(f"Correlation Heatmap - {data_choice} Data")

# Correlation coefficient to use; also reused by the time-lag section.
corr_method = st.selectbox("Correlation Method", ["pearson", "spearman"], index=0)

# Weather and risk columns; only the rainfall column depends on granularity.
rainfall_col = "monthly_rainfall_mm" if data_choice == "Monthly" else "daily_rainfall_mm"
subset_cols = ["temp_avg", "humidity", rainfall_col, "malaria_risk", "dengue_risk"]

corr_df = df.loc[:, subset_cols].corr(method=corr_method)
heatmap_title = f"Correlation Matrix of Weather & Risk ({corr_method.capitalize()})"
fig_corr = px.imshow(corr_df, text_auto=True, aspect="auto", title=heatmap_title)
st.plotly_chart(fig_corr, use_container_width=True)
# ----------------------------------------------------
# 6. Interactive Map (Original)
# ----------------------------------------------------
st.subheader("Interactive Map")
st.markdown(
"""
**Note**: We only have 3 locations. Each marker popup shows some aggregated
stats for the displayed data range.
"""
)

# Base map; the centre and zoom are fixed, markers are added per location.
m = folium.Map(location=[-6.0, 35.0], zoom_start=6)

# Only the rainfall column and its popup label differ between granularities.
if data_choice == "Monthly":
    map_rain_col, map_rain_label = "monthly_rainfall_mm", "Avg Rainfall (mm)"
else:
    map_rain_col, map_rain_label = "daily_rainfall_mm", "Avg Rain (mm/day)"

for loc in selected_locations:
    loc_info = LOCATIONS[loc]
    loc_df = df[df["location"] == loc]
    if loc_df.empty:
        # Nothing left for this site after filtering: no marker.
        continue

    # Means over the rows currently selected for this site.
    avg_risk = loc_df[risk_col].mean()
    avg_temp = loc_df["temp_avg"].mean()
    avg_rain = loc_df[map_rain_col].mean()

    # Popup body, one "<br/>"-terminated line per statistic.
    popup_html = (
        f"\n<b>{loc}</b><br/>\n"
        f"Disease: {disease_choice}<br/>\n"
        f"Avg Risk (in selection): {avg_risk:.2f}<br/>\n"
        f"Avg Temp (°C): {avg_temp:.2f}<br/>\n"
        f"{map_rain_label}: {avg_rain:.2f}<br/>\n"
    )
    marker = folium.Marker(
        location=[loc_info["lat"], loc_info["lon"]],
        popup=popup_html,
        tooltip=f"{loc} ({disease_choice})",
    )
    marker.add_to(m)

# Render the folium map inside the Streamlit page.
st_data = st_folium(m, width=700, height=500)
# ----------------------------------------------------
# 7. Additional Explorations (New Features)
# ----------------------------------------------------
st.header("Additional Explorations")

###############################################################################
# 7.1 Compare Malaria & Dengue Risk Side-by-Side (same chart) for the same data
###############################################################################
st.subheader("Compare Malaria & Dengue Risk Over Time")
compare_both = st.checkbox("Compare Both Diseases on One Plot")

if compare_both:
    # Reshape to long form (one row per date/location/disease) so a single
    # line chart can show both risks. The reshape is the same for monthly
    # and daily data; `df` is already filtered to the selected locations.
    df_long = pd.melt(
        df,
        id_vars=["date", "location", "temp_avg", "humidity"],
        value_vars=["malaria_risk", "dengue_risk"],
        var_name="disease",
        value_name="risk",
    )

    # Colour distinguishes location, dash pattern distinguishes disease.
    title_str = "Malaria vs. Dengue Risk"
    fig_compare = px.line(
        df_long,
        x="date",
        y="risk",
        color="location",
        line_dash="disease",
        title=title_str,
    )
    fig_compare.update_layout(yaxis_title="Risk (0–1)")
    st.plotly_chart(fig_compare, use_container_width=True)
##################################################
# 7.2 Scatter Matrix (Pairwise relationships)
##################################################
st.subheader("Scatter Matrix of Risk & Weather Parameters")

# Let the user choose which columns to include in the matrix.
scatter_cols = st.multiselect(
    "Choose additional columns to include in Scatter Matrix (besides risk & weather).",
    ["temp_avg", "humidity", "monthly_rainfall_mm", "daily_rainfall_mm", "malaria_risk", "dengue_risk"],
    default=["temp_avg", "humidity", "malaria_risk", "dengue_risk"],
)

# The option list offers both rainfall columns, but the current table may
# hold only one of them. Indexing df with a missing name raises KeyError,
# so keep only the columns that actually exist before selecting.
usable_cols = [col for col in scatter_cols if col in df.columns]
skipped_cols = [col for col in scatter_cols if col not in df.columns]
if skipped_cols:
    st.info(
        f"Ignoring columns not available in {data_choice} data: "
        + ", ".join(skipped_cols)
    )

# Also drop columns that exist but contain no values at all.
sm_df = df[usable_cols].dropna(axis=1, how="all")

if sm_df.shape[1] < 2:
    # Checked after filtering, so two picks of which one is unusable
    # still yield the warning rather than a one-dimensional matrix.
    st.warning("Please select at least two columns to generate a scatter matrix.")
else:
    fig_sm = px.scatter_matrix(
        sm_df,
        dimensions=list(sm_df.columns),
        title="Scatter Matrix",
        color_discrete_sequence=["#636EFA"],  # single colour for all points
    )
    fig_sm.update_layout(width=800, height=800)
    st.plotly_chart(fig_sm, use_container_width=True)
##################################################
# 7.3 Simple Time-Lag Correlation (Example)
##################################################
st.subheader("Time-Lag Correlation ⚠️ NEEDS NIMR Data to work")
st.markdown("""
Experiment with a simple lag analysis. For example, check how
temperature or rainfall in previous weeks/months correlates with **current**
Malaria/Dengue risk.
""")
time_lag = st.slider("Select Lag (days) to shift weather parameters", min_value=0, max_value=60, value=0, step=5)

df_lag = df.copy()

# Translate the slider value into a number of rows to shift by. Daily data
# shifts by that many rows; monthly data approximates one month as 30 days,
# so any lag below 30 becomes a shift of zero.
# NOTE(review): a row shift equals a calendar lag only if each location has
# one row per period with no gaps — confirm against the CSVs.
if data_choice == "Daily":
    lag_periods, lag_rain_source = time_lag, "daily_rainfall_mm"
else:
    lag_periods, lag_rain_source = time_lag // 30, "monthly_rainfall_mm"

if lag_periods > 0:
    # Shift within each location, in date order, then keep only rows where
    # both lagged values are present.
    df_lag = df_lag.sort_values("date")
    shifted = df_lag.groupby("location")[["temp_avg", lag_rain_source]].shift(lag_periods)
    df_lag["temp_avg_lag"] = shifted["temp_avg"]
    df_lag["rain_lag"] = shifted[lag_rain_source]
    df_lag = df_lag.dropna(subset=["temp_avg_lag", "rain_lag"])

# Correlate the selected disease risk with the lagged weather columns,
# using the method chosen in the heatmap section.
if not {"temp_avg_lag", "rain_lag"}.issubset(df_lag.columns):
    st.write("No lag columns or lag is set to 0. Increase the lag to see results.")
else:
    risk_series = df_lag[risk_col]
    lag_corr_temp = risk_series.corr(df_lag["temp_avg_lag"], method=corr_method)
    lag_corr_rain = risk_series.corr(df_lag["rain_lag"], method=corr_method)
    st.write(f"**Correlation between {disease_choice} Risk and lagged Temperature**: {lag_corr_temp:.3f}")
    st.write(f"**Correlation between {disease_choice} Risk and lagged Rainfall**: {lag_corr_rain:.3f}")
##################################################
# 7.4 Outbreak Statistics
##################################################
st.subheader("Outbreak Statistics - ⚠️ NEEDS NIMR Data to work")
st.markdown("""
This section will show the **count** of outbreak periods based on selection
and some summary statistics, once we have overlayed NIMR Data with the Existing Weather Data
""")

# Flag column for the disease picked in the sidebar.
outbreak_flag_col = "malaria_outbreak" if disease_choice == "Malaria" else "dengue_outbreak"

if outbreak_flag_col not in df.columns:
    st.write(f"No outbreak flag column found for {disease_choice}.")
else:
    is_outbreak = df[outbreak_flag_col].eq(True)

    # Number of flagged rows per location (locations with none are absent).
    outbreak_count_by_loc = (
        df[is_outbreak]
        .groupby("location")
        .size()
        .reset_index(name="outbreak_count")
    )
    st.write("**Number of outbreak instances (in current selection) by location:**")
    st.dataframe(outbreak_count_by_loc)

    # Average weather during flagged rows versus rows flagged False.
    with st.expander("Compare Weather Averages During Outbreak vs. Non-Outbreak"):
        outbreak_df = df[is_outbreak]
        non_outbreak_df = df[df[outbreak_flag_col].eq(False)]

        if outbreak_df.empty:
            st.write(f"No {disease_choice} outbreaks found in the current selection.")
        else:
            # The rainfall column name depends on the granularity in use.
            stats_rain_col = (
                "daily_rainfall_mm" if data_choice == "Daily" else "monthly_rainfall_mm"
            )

            avg_temp_outbreak = outbreak_df["temp_avg"].mean()
            avg_hum_outbreak = outbreak_df["humidity"].mean()
            avg_rain_outbreak = outbreak_df[stats_rain_col].mean()

            avg_temp_non = non_outbreak_df["temp_avg"].mean()
            avg_hum_non = non_outbreak_df["humidity"].mean()
            avg_rain_non = non_outbreak_df[stats_rain_col].mean()

            st.write(f"**Outbreak Periods** ({disease_choice}):")
            st.write(f"- Avg Temperature: {avg_temp_outbreak:.2f} °C")
            st.write(f"- Avg Humidity: {avg_hum_outbreak:.2f}%")
            st.write(f"- Avg Rainfall: {avg_rain_outbreak:.2f} mm")
            st.write(f"**Non-Outbreak Periods** ({disease_choice}):")
            st.write(f"- Avg Temperature: {avg_temp_non:.2f} °C")
            st.write(f"- Avg Humidity: {avg_hum_non:.2f}%")
            st.write(f"- Avg Rainfall: {avg_rain_non:.2f} mm")
|