File size: 10,159 Bytes
583b3ab 680da52 583b3ab |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 |
import streamlit as st
import pandas as pd
from joblib import load
import numpy as np
import matplotlib.pyplot as plt
@st.cache_data
def load_data() -> pd.DataFrame:
    """
    Loads the `.csv` data using pandas and imputes its missing values.

    Numerical features ('Age', 'Lung Capacity', 'Hospital Visits') are
    filled with their column mean; categorical features ('Gender',
    'Smoking Status', 'Disease Type', 'Treatment Type', 'Recovered') are
    filled with their column mode.

    The result is cached with `st.cache_data` (the cache Streamlit intends
    for DataFrames) rather than `st.cache_resource`, so callers receive a
    copy and cannot mutate a frame shared across sessions and reruns.

    Returns:
        The imputed dataset read from `./lung_disease_data.csv`.
    """
    df = pd.read_csv('./lung_disease_data.csv')

    # Impute numerical features with their mean
    numerical_columns = ['Age', 'Lung Capacity', 'Hospital Visits']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Impute categorical features with their mode (most frequent value)
    categorical_columns = ['Gender', 'Smoking Status', 'Disease Type', 'Treatment Type', 'Recovered']
    df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

    return df
@st.cache_resource
def load_models() -> dict:
    """
    Loads the trained models for prediction.

    Each model is deserialized with joblib from `./models/` and stored
    under the display name shown in the model selector.

    Returns:
        A dict mapping display name -> loaded model object, in the order
        the models are listed below.
    """
    # (display name, pickle path) pairs; order defines the dict order.
    model_files = (
        ('Gaussian Naive Bayes', './models/GaussianNB.pkl'),
        ('Logistic Regression', './models/LogisticRegression.pkl'),
        ('Random Forest', './models/RandomForests.pkl'),
        ('Support Vector Machines', './models/SVM.pkl'),
        ('XG Boost', './models/XGBoost.pkl'),
    )
    return {label: load(path) for label, path in model_files}
def prediction(model, age: int, gender: str,
               smoke_status: str, lung_capacity: float,
               disease_type: str, treatment_type: str,
               hospital_visits: int
               ) -> int:
    """
    Encodes one patient's answers as a feature row and runs `model` on it.

    The row has 13 columns in this fixed order: 'Age', 'Hospital Visits',
    'Lung Capacity', 'Gender' (1 for "Male", else 0), 'Smoking Status'
    (1 for "Yes", else 0), then one-hot indicators for the five disease
    types and the three treatment types.

    Args:
        model: Any object exposing `predict(array)` that returns an
            indexable result whose first element has an `.item()` method.
        age: Patient age.
        gender: "Male" is encoded as 1; anything else as 0.
        smoke_status: "Yes" is encoded as 1; anything else as 0.
        lung_capacity: Measured lung capacity.
        disease_type: One of 'Asthma', 'Bronchitis', 'COPD', 'Lung Cancer',
            'Pneumonia'. Any other value leaves all disease indicators at 0.
        treatment_type: One of 'Medication', 'Surgery', 'Therapy'. Any
            other value leaves all treatment indicators at 0.
        hospital_visits: Number of hospital visits.

    Returns:
        The model's prediction for the row, converted to a Python scalar.
    """
    features = {
        'Age': [age],
        'Hospital Visits': [hospital_visits],
        'Lung Capacity': [lung_capacity],
        'Gender': [1 if gender == "Male" else 0],
        'Smoking Status': [1 if smoke_status == "Yes" else 0],
    }

    # One-hot encode by exact equality. A substring test (`x in 'Disease
    # Type_Asthma'`) would light up every indicator for "" or "Type".
    for disease in ('Asthma', 'Bronchitis', 'COPD', 'Lung Cancer', 'Pneumonia'):
        features[f'Disease Type_{disease}'] = [1 if disease_type == disease else 0]
    for treatment in ('Medication', 'Surgery', 'Therapy'):
        features[f'Treatment Type_{treatment}'] = [1 if treatment_type == treatment else 0]

    # Dict insertion order fixes the column order of the array fed to the model.
    input_arr = np.array(pd.DataFrame(features))
    predicted = model.predict(input_arr)[0]
    return predicted.item()
def _plot_histogram(ax, df: pd.DataFrame, column: str) -> None:
    """Draws the frequency histogram of `column` on the matplotlib axes `ax`."""
    ax.hist(df[column])
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{column} Distribution')


def _plot_grouped_counts(ax, df: pd.DataFrame, index_col: str, group_col: str,
                         title: str, legend_title: str) -> None:
    """Draws a grouped bar chart of row counts per (`index_col`, `group_col`) pair on `ax`."""
    counts = df.groupby([index_col, group_col]).size().unstack(fill_value=0)
    counts.plot(kind='bar', stacked=False, ax=ax)
    ax.set_xlabel(index_col)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend(title=legend_title)


def _render_about_tab(df: pd.DataFrame) -> None:
    """Renders the 'About Data' tab: the raw table and a description of each column."""
    st.header('About the Data')
    st.caption('In this tab, we will explore the particular details about our data.')
    st.caption('Take a look at the data table.')
    st.dataframe(df)

    col1, col2 = st.columns(2)
    with col1:
        st.caption('This dataset captures detailed information about patients suffering from various lung conditions. It includes:')
        st.caption('**Age & Gender**: Patient demographics to understand the spread across age groups and gender.')
        st.caption('**Smoking Status**: Whether the patient is a smoker or non-smoker.')
        st.caption('**Lung Capacity**: Measured lung function to assess disease severity.')
        st.caption('**Disease Type**: The specific lung condition, like COPD or Bronchitis.')
    with col2:
        st.caption('**Treatment Type**: Different treatments patients received, including therapy, medication, or surgery.')
        st.caption('**Hospital Visits**: Number of visits to the hospital for managing the condition.')
        st.caption('**Recovery Status**: Indicates whether the patient recovered after treatment.')

    url = 'https://www.kaggle.com/datasets/samikshadalvi/lungs-diseases-dataset'
    st.caption('For more details, check out the original [source](%s) of the dataset.' % url)


def _render_prediction_tab(df: pd.DataFrame) -> None:
    """Renders the 'Prediction' tab: collects patient inputs and shows the chosen model's verdict."""
    st.header('Prediction Tab')
    st.caption('In this tab, our ML models will predict if you will recover based on your data.')

    models = load_models()
    model = st.selectbox('Select preferred model for prediction', models.keys())
    model_predictor = models[model]

    col1, col2 = st.columns(2)
    with col1:
        age = st.number_input('What is your age?', min_value=0, max_value=100)
        # Choices come from the dataset so they match the values the models were fit on.
        gender = st.radio('What is your gender?', df['Gender'].unique())
        disease = st.selectbox('What is your lung condition?', df['Disease Type'].unique())
        treatment = st.selectbox('Which treatment did you receive?', df['Treatment Type'].unique())
    with col2:
        visits = st.number_input('How many times do you visit the hospital? (Annually)', min_value=0, max_value=365)
        capacity = st.slider('What is your lung capacity?', min_value=1.00, max_value=df['Lung Capacity'].max()+5)
        smoke = st.radio('Do you smoke?', ['Yes', 'No'])

    if st.button('Predict!'):
        pred = prediction(model_predictor, age, gender, smoke, capacity, disease, treatment, visits)
        rec = 'Recovered!' if pred == 1 else 'I am sorry.'
        st.header(rec)


def _render_plots_tab(df: pd.DataFrame) -> None:
    """Renders the pre-built 4x2 grid of feature distributions and recovery breakdowns."""
    st.title('Feature Distribution and Relationships')
    st.caption('In this tab we will see the feature distributions of the dataset.')
    st.caption('We can see the relationships of the features among each other.')

    fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(15, 25))

    _plot_histogram(axes[0, 0], df, 'Age')
    _plot_histogram(axes[0, 1], df, 'Lung Capacity')
    _plot_histogram(axes[1, 0], df, 'Hospital Visits')

    _plot_grouped_counts(axes[1, 1], df, 'Gender', 'Recovered',
                         'Gender Count by Recovery', 'Recovered')
    _plot_grouped_counts(axes[2, 0], df, 'Smoking Status', 'Recovered',
                         'Smoking Status by Recovery', 'Recovered')
    _plot_grouped_counts(axes[2, 1], df, 'Disease Type', 'Recovered',
                         'Disease Type by Recovery', 'Recovered')
    _plot_grouped_counts(axes[3, 0], df, 'Treatment Type', 'Recovered',
                         'Treatment Type by Recovery', 'Recovered')
    _plot_grouped_counts(axes[3, 1], df, 'Disease Type', 'Treatment Type',
                         'Disease Type by Treatment Type', 'Treatment')

    # Lay out BEFORE rendering; applied afterwards it never reaches the page.
    fig.tight_layout()
    st.pyplot(fig)
    # Streamlit reruns this script on every interaction; close the figure
    # so pyplot does not accumulate one open figure per rerun.
    plt.close(fig)


def _render_custom_plot_tab(df: pd.DataFrame) -> None:
    """Renders the 'Custom Plot' tab: the user picks X, Y and a chart type."""
    x = st.selectbox("Choose X for plotting.", tuple(df.columns))
    # Y choices exclude the column already chosen for X.
    y = st.selectbox("Choose Y for plotting.", tuple(df.drop(x, axis=1).columns))
    plot = st.selectbox("Select type of plot.", ("Scatter", "Bar", "Line"))

    if st.button("Plot X and Y!"):
        if plot == "Scatter":
            st.scatter_chart(
                data=df,
                x=x,
                y=y,
                size='Recovered'
            )
        elif plot == "Bar":
            st.bar_chart(
                data=df,
                x=x,
                y=y
            )
        elif plot == "Line":
            st.line_chart(
                data=df,
                x=x,
                y=y
            )


def main():
    """
    Builds the Streamlit page: a header plus the 'About Data', 'Prediction'
    and 'Data Viz' tabs, all backed by the dataset returned by `load_data()`.
    """
    st.header("Lung Disease Recovery Predictor")
    st.caption('Prepared by `hydraadra112` | John Manuel Carado')

    data_tab, pred_tab, data_viz = st.tabs(['About Data', 'Prediction', 'Data Viz'])
    df = load_data()

    with data_tab:
        _render_about_tab(df)

    with pred_tab:
        _render_prediction_tab(df)

    with data_viz:
        st.title('Data Viz Tab')
        st.caption('In this tab, we can visualize the relationships among our data.')
        st.caption('See our pre-existing plots and you can also plot your own!')

        dviz_tab1, dviz_tab2 = st.tabs(['Plots', 'Custom Plot'])
        with dviz_tab1:
            _render_plots_tab(df)
        with dviz_tab2:
            _render_custom_plot_tab(df)
# Run the app only when executed as a script (e.g. via `streamlit run`).
if __name__ == "__main__":
    main()