Spaces:

lyimo
/

MALARIA_DENGUE

Running

App Files Files Community

MALARIA_DENGUE / app.py

lyimo

Update app.py

e9eeeff verified 5 months ago

raw

history blame contribute delete

17.2 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import plotly.express as px
	import plotly.graph_objs as go
	import folium
	from streamlit_folium import st_folium
	from datetime import timedelta

	# ----------------------------------------------------
	# 1. Load data
	# ----------------------------------------------------
	@st.cache_data
	def load_data():
	    """Read the daily and monthly CSV datasets from the working directory.

	    The daily file's ``date`` column is parsed to datetimes while loading.
	    The monthly file is returned exactly as read; no ``date`` column is
	    built here, so callers that need one must derive it themselves (for
	    example from the year and month columns).

	    Returns:
	        tuple: ``(daily, monthly)`` pandas DataFrames, in that order.
	    """
	    return (
	        pd.read_csv("daily_data_2013_2024.csv", parse_dates=["date"]),
	        pd.read_csv("monthly_data_2013_2024.csv"),
	    )

	# Fetch both datasets through load_data (wrapped in st.cache_data).
	daily_data, monthly_data = load_data()

	# Latitude/longitude per location, used to place each location's map marker.
	LOCATIONS = {
	    site: {"lat": lat, "lon": lon}
	    for site, lat, lon in (
	        ("Karagwe", -1.7718, 30.9876),
	        ("Masasi", -10.7167, 38.8000),
	        ("Igunga", -4.2833, 33.8833),
	    )
	}

	# ----------------------------------------------------
	# 2. Streamlit UI Layout
	# ----------------------------------------------------
	st.title("Malaria & Dengue Outbreak Analysis (2013–2024)")

	# All filter widgets below live in the sidebar.
	sidebar = st.sidebar
	sidebar.header("Filters & Options")

	# Which disease's risk to focus on, and which dataset granularity to show.
	disease_choice = sidebar.radio("Select Disease", ["Malaria", "Dengue"], index=0)
	data_choice = sidebar.radio("Data Granularity", ["Monthly", "Daily"], index=0)

	# Every known location is offered and pre-selected.
	location_list = list(LOCATIONS)
	selected_locations = sidebar.multiselect(
	    "Select Location(s)",
	    location_list,
	    default=location_list,
	)

	if data_choice != "Monthly":
	    # Daily view: date-range picker bounded by the span of the daily dataset.
	    daily_dates = daily_data["date"]
	    date_min, date_max = daily_dates.min(), daily_dates.max()
	    date_range = sidebar.date_input(
	        "Select Date Range",
	        [date_min, date_max],
	        min_value=date_min,
	        max_value=date_max,
	    )
	else:
	    # Monthly view: year-range slider bounded by the years in the monthly dataset.
	    years = monthly_data["year"]
	    year_min, year_max = int(years.min()), int(years.max())
	    year_range = sidebar.slider(
	        "Select Year Range",
	        min_value=year_min,
	        max_value=year_max,
	        value=(year_min, year_max),
	        step=1,
	    )

	# ----------------------------------------------------
	# 3. Filter data based on user input
	# ----------------------------------------------------
	if data_choice == "Monthly":
	    # Apply the location and year filters as one mask and copy once afterwards,
	    # so the "date" column below is added to an independent frame instead of
	    # a filtered slice (which triggers pandas' SettingWithCopyWarning).
	    first_year, last_year = year_range
	    monthly_mask = (
	        monthly_data["location"].isin(selected_locations)
	        & monthly_data["year"].between(first_year, last_year)  # inclusive bounds
	    )
	    df = monthly_data[monthly_mask].copy()

	    # First-of-month timestamp so monthly rows can be plotted on a time axis.
	    df["date"] = pd.to_datetime(
	        df["year"].astype(str) + "-" + df["month"].astype(str) + "-01"
	    )
	else:
	    # st.date_input yields fewer than two dates while the user has picked only
	    # the start of the range; indexing date_range[1] would raise IndexError.
	    # Pause this run until the range is complete instead of crashing.
	    if len(date_range) != 2:
	        st.info("Select both a start and an end date to filter the daily data.")
	        st.stop()

	    range_start, range_end = (pd.to_datetime(day) for day in date_range)
	    daily_mask = (
	        daily_data["location"].isin(selected_locations)
	        & daily_data["date"].between(range_start, range_end)  # inclusive bounds
	    )
	    df = daily_data[daily_mask].copy()

	# ----------------------------------------------------
	# 4. Interactive Plotly Time-Series (Original)
	# ----------------------------------------------------
	st.subheader(f"{data_choice} {disease_choice} Risk & Climate Parameters")

	# Risk column for the selected disease (the map and lag sections read it too).
	risk_col = "malaria_risk" if disease_choice == "Malaria" else "dengue_risk"


	def _show_line_chart(frame, y, title, yaxis_title=None):
	    """Render one Plotly line chart of ``y`` over ``date``, one line per location.

	    Args:
	        frame: DataFrame providing the ``date``, ``location`` and ``y`` columns.
	        y: Name of the column plotted on the y-axis.
	        title: Chart title.
	        yaxis_title: Optional y-axis label; Plotly's default is kept when None.
	    """
	    chart = px.line(frame, x="date", y=y, color="location", title=title)
	    if yaxis_title is not None:
	        chart.update_layout(yaxis_title=yaxis_title)
	    st.plotly_chart(chart, use_container_width=True)


	# The two granularities differ only in chart titles and the rainfall column,
	# so pick those up front and draw the three charts once.
	if data_choice == "Monthly":
	    risk_title = f"{disease_choice} Risk Over Time ({data_choice})"
	    temp_title = "Average Temperature (°C)"
	    rain_col, rain_title = "monthly_rainfall_mm", "Monthly Rainfall (mm)"
	else:
	    risk_title = f"{disease_choice} Daily Risk Over Time (2013–2024)"
	    temp_title = "Daily Avg Temperature (°C)"
	    rain_col, rain_title = "daily_rainfall_mm", "Daily Rainfall (mm)"

	_show_line_chart(df, risk_col, risk_title, yaxis_title="Risk (0–1)")

	# Temperature and rainfall side by side.
	col1, col2 = st.columns(2)
	with col1:
	    _show_line_chart(df, "temp_avg", temp_title)
	with col2:
	    _show_line_chart(df, rain_col, rain_title)

	# The outbreak table is only shown for monthly data.
	if data_choice == "Monthly":
	    flag_col = "malaria_outbreak" if disease_choice == "Malaria" else "dengue_outbreak"

	    if flag_col not in df.columns:
	        # Same guard and message as the outbreak-statistics section further
	        # down, rather than failing with a KeyError when the flag is absent.
	        st.write(f"No outbreak flag column found for {disease_choice}.")
	    else:
	        outbreak_months = df[df[flag_col].eq(True)]
	        if outbreak_months.empty:
	            st.write(f"No months meet the {disease_choice} outbreak criteria in this selection.")
	        else:
	            st.write(f"Months with likely {disease_choice} outbreak:")
	            st.dataframe(outbreak_months[[
	                "location", "year", "month", "temp_avg",
	                "humidity", "monthly_rainfall_mm", flag_col,
	            ]])

	# ----------------------------------------------------
	# 5. Correlation Heatmap (Original)
	# ----------------------------------------------------
	st.subheader(f"Correlation Heatmap - {data_choice} Data")

	# Correlation method chosen by the user (the time-lag section reuses it).
	corr_method = st.selectbox("Correlation Method", ["pearson", "spearman"], index=0)

	# Only the rainfall column name depends on the granularity.
	rainfall_field = "monthly_rainfall_mm" if data_choice == "Monthly" else "daily_rainfall_mm"
	subset_cols = ["temp_avg", "humidity", rainfall_field, "malaria_risk", "dengue_risk"]

	# Pairwise correlation of the weather and risk columns in the current selection.
	corr_df = df.loc[:, subset_cols].corr(method=corr_method)
	heatmap_title = f"Correlation Matrix of Weather & Risk ({corr_method.capitalize()})"
	fig_corr = px.imshow(corr_df, text_auto=True, aspect="auto", title=heatmap_title)
	st.plotly_chart(fig_corr, use_container_width=True)

	# ----------------------------------------------------
	# 6. Interactive Map (Original)
	# ----------------------------------------------------
	st.subheader("Interactive Map")
	st.markdown(
	    """
	Note: We only have 3 locations. Each marker popup shows some aggregated
	stats for the displayed data range.
	"""
	)

	# Base map centred roughly on Tanzania.
	m = folium.Map(location=[-6.0, 35.0], zoom_start=6)

	# Monthly and daily popups differ only in the rainfall column and its label,
	# so resolve those once and build every marker in a single loop.
	if data_choice == "Monthly":
	    rain_field, rain_label = "monthly_rainfall_mm", "Avg Rainfall (mm)"
	else:
	    rain_field, rain_label = "daily_rainfall_mm", "Avg Rain (mm/day)"

	for loc in selected_locations:
	    loc_info = LOCATIONS[loc]
	    loc_df = df[df["location"] == loc]
	    if loc_df.empty:
	        # No rows for this location in the current selection: no marker.
	        continue

	    # Averages over the rows currently selected for this location.
	    avg_risk = loc_df[risk_col].mean()
	    avg_temp = loc_df["temp_avg"].mean()
	    avg_rain = loc_df[rain_field].mean()

	    popup_html = f"""
	<b>{loc}</b><br/>
	Disease: {disease_choice}<br/>
	Avg Risk (in selection): {avg_risk:.2f}<br/>
	Avg Temp (°C): {avg_temp:.2f}<br/>
	{rain_label}: {avg_rain:.2f}<br/>
	"""
	    folium.Marker(
	        location=[loc_info["lat"], loc_info["lon"]],
	        popup=popup_html,
	        tooltip=f"{loc} ({disease_choice})",
	    ).add_to(m)

	# Render the Folium map in Streamlit; the component's return value is kept
	# in st_data, as in the original script.
	st_data = st_folium(m, width=700, height=500)

	# ----------------------------------------------------
	# 7. Additional Explorations (New Features)
	# ----------------------------------------------------
	st.header("Additional Explorations")

	###############################################################################
	# 7.1 Compare Malaria & Dengue Risk Side-by-Side (same chart) for the same data
	###############################################################################
	st.subheader("Compare Malaria & Dengue Risk Over Time")
	compare_both = st.checkbox("Compare Both Diseases on One Plot")

	if compare_both:
	    # Reshape to long format (one row per date/location/disease) so Plotly can
	    # draw both risks on one figure. The reshape does not depend on the
	    # granularity: the monthly and daily branches used the exact same call,
	    # so a single melt replaces the redundant if/else.
	    df_long = df.melt(
	        id_vars=["date", "location", "temp_avg", "humidity"],
	        value_vars=["malaria_risk", "dengue_risk"],
	        var_name="disease",
	        value_name="risk",
	    )

	    # df is already restricted to the selected locations. Colour separates
	    # locations; dash style separates the two diseases.
	    fig_compare = px.line(
	        df_long,
	        x="date",
	        y="risk",
	        color="location",
	        line_dash="disease",
	        title="Malaria vs. Dengue Risk",
	    )
	    fig_compare.update_layout(yaxis_title="Risk (0–1)")
	    st.plotly_chart(fig_compare, use_container_width=True)

	##################################################
	# 7.2 Scatter Matrix (Pairwise relationships)
	##################################################
	st.subheader("Scatter Matrix of Risk & Weather Parameters")

	# Let the user choose which columns go into the scatter matrix.
	scatter_cols = st.multiselect(
	    "Choose additional columns to include in Scatter Matrix (besides risk & weather).",
	    ["temp_avg", "humidity", "monthly_rainfall_mm", "daily_rainfall_mm", "malaria_risk", "dengue_risk"],
	    default=["temp_avg", "humidity", "malaria_risk", "dengue_risk"],
	)

	if len(scatter_cols) < 2:
	    st.warning("Please select at least two columns to generate a scatter matrix.")
	else:
	    # The option list offers both rainfall columns, but the current frame may
	    # hold only the one for its granularity. Indexing df with an absent
	    # column raises KeyError, so keep only the columns that are present.
	    present_cols = [col for col in scatter_cols if col in df.columns]
	    missing_cols = [col for col in scatter_cols if col not in df.columns]
	    if missing_cols:
	        st.info(
	            f"Skipping columns not available in {data_choice} data: "
	            + ", ".join(missing_cols)
	        )

	    if len(present_cols) < 2:
	        st.warning(
	            f"At least two of the selected columns must exist in the {data_choice} "
	            "data to generate a scatter matrix."
	        )
	    else:
	        # Columns that exist but are entirely NaN carry nothing to plot.
	        sm_df = df[present_cols].dropna(axis=1, how="all")

	        fig_sm = px.scatter_matrix(
	            sm_df,
	            dimensions=sm_df.columns,
	            title="Scatter Matrix",
	            color_discrete_sequence=["#636EFA"],  # single marker colour
	        )
	        fig_sm.update_layout(width=800, height=800)
	        st.plotly_chart(fig_sm, use_container_width=True)

	##################################################
	# 7.3 Simple Time-Lag Correlation (Example)
	##################################################
	st.subheader("Time-Lag Correlation ⚠️ NEEDS NIMR Data to work")

	st.markdown("""
	Experiment with a simple lag analysis. For example, check how
	temperature or rainfall in previous weeks/months correlates with current
	Malaria/Dengue risk.
	""")

	time_lag = st.slider(
	    "Select Lag (days) to shift weather parameters",
	    min_value=0,
	    max_value=60,
	    value=0,
	    step=5,
	)

	# Translate the slider value into a per-location row shift for the active
	# granularity, and pick the matching rainfall column.
	if data_choice == "Daily":
	    # Shift by `time_lag` rows (presumably one row per day per location).
	    shift_rows, rain_source = time_lag, "daily_rainfall_mm"
	elif data_choice == "Monthly":
	    # Monthly rows: approximate days -> months at 30 days per month, so any
	    # lag below 30 days results in no shift.
	    shift_rows, rain_source = time_lag // 30, "monthly_rainfall_mm"
	else:
	    shift_rows, rain_source = 0, None

	df_lag = df.copy()
	if shift_rows > 0:
	    # Order by date so shifting within each location moves values in time.
	    df_lag = df_lag.sort_values("date")
	    shifted = df_lag.groupby("location")[["temp_avg", rain_source]].shift(shift_rows)
	    df_lag["temp_avg_lag"] = shifted["temp_avg"]
	    df_lag["rain_lag"] = shifted[rain_source]
	    # The first `shift_rows` rows of each location have no lagged value.
	    df_lag = df_lag.dropna(subset=["temp_avg_lag", "rain_lag"])

	# Correlate the current risk with the lagged weather columns, when they exist.
	lag_fields = ("temp_avg_lag", "rain_lag")
	if all(field in df_lag.columns for field in lag_fields):
	    risk_series = df_lag[risk_col]
	    lag_corr_temp = risk_series.corr(df_lag["temp_avg_lag"], method=corr_method)
	    lag_corr_rain = risk_series.corr(df_lag["rain_lag"], method=corr_method)

	    st.write(f"Correlation between {disease_choice} Risk and lagged Temperature: {lag_corr_temp:.3f}")
	    st.write(f"Correlation between {disease_choice} Risk and lagged Rainfall: {lag_corr_rain:.3f}")
	else:
	    st.write("No lag columns or lag is set to 0. Increase the lag to see results.")

	##################################################
	# 7.4 Outbreak Statistics
	##################################################
	st.subheader("Outbreak Statistics - ⚠️ NEEDS NIMR Data to work")

	st.markdown("""
	This section will show the count of outbreak periods based on selection
	and some summary statistics, once we have overlayed NIMR Data with the Existing Weather Data
	""")

	# Outbreak flag column for the selected disease.
	outbreak_flag_col = "malaria_outbreak" if disease_choice == "Malaria" else "dengue_outbreak"

	if outbreak_flag_col not in df.columns:
	    st.write(f"No outbreak flag column found for {disease_choice}.")
	else:
	    # Split the selection into flagged and unflagged rows.
	    flags = df[outbreak_flag_col]
	    outbreak_df = df[flags.eq(True)]
	    non_outbreak_df = df[flags.eq(False)]

	    # Count of flagged rows per location.
	    outbreak_count_by_loc = (
	        outbreak_df.groupby("location").size().reset_index(name="outbreak_count")
	    )
	    st.write("Number of outbreak instances (in current selection) by location:")
	    st.dataframe(outbreak_count_by_loc)

	    # Average temperature, humidity and rainfall: outbreak vs. non-outbreak rows.
	    with st.expander("Compare Weather Averages During Outbreak vs. Non-Outbreak"):
	        if outbreak_df.empty:
	            st.write(f"No {disease_choice} outbreaks found in the current selection.")
	        else:
	            rain_field = "daily_rainfall_mm" if data_choice == "Daily" else "monthly_rainfall_mm"
	            weather_fields = ("temp_avg", "humidity", rain_field)

	            avg_temp_outbreak, avg_hum_outbreak, avg_rain_outbreak = (
	                outbreak_df[field].mean() for field in weather_fields
	            )
	            avg_temp_non, avg_hum_non, avg_rain_non = (
	                non_outbreak_df[field].mean() for field in weather_fields
	            )

	            st.write(f"Outbreak Periods ({disease_choice}):")
	            st.write(f"- Avg Temperature: {avg_temp_outbreak:.2f} °C")
	            st.write(f"- Avg Humidity: {avg_hum_outbreak:.2f}%")
	            st.write(f"- Avg Rainfall: {avg_rain_outbreak:.2f} mm")

	            st.write(f"Non-Outbreak Periods ({disease_choice}):")
	            st.write(f"- Avg Temperature: {avg_temp_non:.2f} °C")
	            st.write(f"- Avg Humidity: {avg_hum_non:.2f}%")
	            st.write(f"- Avg Rainfall: {avg_rain_non:.2f} mm")