Commit
·
583b3ab
1
Parent(s):
c2f6ae2
Created app
Browse files- app.py +238 -0
- model.ipynb +0 -0
- models/LogisticRegression.pkl +1 -1
- models/RandomForests.pkl +2 -2
app.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from joblib import load
|
4 |
+
import numpy as np
|
5 |
+
import matplotlib.pyplot as plt
|
6 |
+
|
7 |
+
@st.cache_data
def load_data() -> pd.DataFrame:
    """
    Load the lung disease dataset from `./lung_disease_data.csv` and impute
    its missing values.

    Numerical columns are filled with their column mean; categorical columns
    are filled with their mode (most frequent value).

    The result is cached with `st.cache_data` rather than `st.cache_resource`:
    a DataFrame is data, not a shared global resource, so each caller should
    receive its own copy instead of one object shared by every session.

    Returns:
        The imputed dataset as a DataFrame.
    """
    df = pd.read_csv('./lung_disease_data.csv')

    # Impute numerical features with their mean
    numerical_columns = ['Age', 'Lung Capacity', 'Hospital Visits']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Impute categorical features with their mode (most frequent value)
    categorical_columns = ['Gender', 'Smoking Status', 'Disease Type', 'Treatment Type', 'Recovered']
    df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

    return df
|
22 |
+
|
23 |
+
@st.cache_resource
def load_models() -> dict:
    """
    Load the trained models used for prediction.

    Returns:
        A dict mapping each model's display name to the estimator
        deserialized from its `.pkl` file under `./models/`.
    """
    # (display name, file path) pairs; the order here is the order in which
    # the files are loaded and in which the names appear in the result.
    model_files = (
        ('Gaussian Naive Bayes', './models/GaussianNB.pkl'),
        ('Logistic Regression', './models/LogisticRegression.pkl'),
        ('Random Forest', './models/RandomForests.pkl'),
        ('Support Vector Machines', './models/SVM.pkl'),
        ('XG Boost', './models/XGboost.pkl'),
    )

    return {name: load(path) for name, path in model_files}
|
43 |
+
|
44 |
+
def prediction(model, age: int, gender: str,
               smoke_status: str, lung_capacity: float,
               disease_type: str, treatment_type: str,
               hospital_visits: int
               ) -> int:
    """
    Build a single-row feature vector from the patient's answers and return
    the model's prediction for it.

    The feature order is: Age, Hospital Visits, Lung Capacity, Gender,
    Smoking Status, then one-hot flags for the disease types and the
    treatment types.

    Args:
        model: Object exposing `predict(array)`; the first element of its
            result must support `.item()`.
        age: Patient age.
        gender: Encoded as 1 when equal to "Male", otherwise 0.
        smoke_status: Encoded as 1 when equal to "Yes", otherwise 0.
        lung_capacity: Patient lung capacity.
        disease_type: Disease category (e.g. "COPD"); the full column name
            (e.g. "Disease Type_COPD") is accepted as well.
        treatment_type: Treatment category (e.g. "Surgery"); the full column
            name (e.g. "Treatment Type_Surgery") is accepted as well.
        hospital_visits: Number of hospital visits.

    Returns:
        The first predicted value converted to a plain Python scalar.
    """
    features = {
        'Age': [age],
        'Hospital Visits': [hospital_visits],
        'Lung Capacity': [lung_capacity],
        'Gender': [1 if gender == "Male" else 0],
        'Smoking Status': [1 if smoke_status == "Yes" else 0],
    }

    # One-hot encode with exact matches. A substring test such as
    # `disease_type in 'Disease Type_Asthma'` would also fire for "", "Type"
    # or "Lung", silently setting flags for values that are not categories.
    for disease in ('Asthma', 'Bronchitis', 'COPD', 'Lung Cancer', 'Pneumonia'):
        column = f'Disease Type_{disease}'
        features[column] = [1 if disease_type in (disease, column) else 0]

    for treatment in ('Medication', 'Surgery', 'Therapy'):
        column = f'Treatment Type_{treatment}'
        features[column] = [1 if treatment_type in (treatment, column) else 0]

    input_arr = np.array(pd.DataFrame(features))

    predicted = model.predict(input_arr)[0]

    return predicted.item()
|
73 |
+
|
74 |
+
def _plot_histogram(ax, values, label: str) -> None:
    """Draw a frequency histogram of `values` on `ax`, labelled with `label`."""
    ax.hist(values)
    ax.set_xlabel(label)
    ax.set_ylabel('Frequency')
    ax.set_title(f'{label} Distribution')


def _plot_grouped_counts(ax, df, category: str, hue: str,
                         title: str, legend_title: str) -> None:
    """Draw side-by-side bars on `ax` counting rows per `category`, split by `hue`."""
    counts = df.groupby([category, hue]).size().unstack(fill_value=0)
    counts.plot(kind='bar', stacked=False, ax=ax)
    ax.set_xlabel(category)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend(title=legend_title)


def main():
    """
    Render the Streamlit app: an 'About Data' tab showing the dataset, a
    'Prediction' tab that feeds the user's answers to a selected model, and
    a 'Data Viz' tab with fixed plots plus a user-configurable chart.
    """
    st.header("Lung Disease Recovery Predictor")
    st.caption('Prepared by `hydraadra112` | John Manuel Carado')

    data_tab, pred_tab, data_viz = st.tabs(['About Data', 'Prediction', 'Data Viz'])
    df = load_data()

    with data_tab:
        st.header('About the Data')
        st.caption('In this tab, we will explore the particular details about our data.')

        st.caption('Take a look at the data table.')
        st.dataframe(df)

        col1, col2 = st.columns(2)

        with col1:
            st.caption('This dataset captures detailed information about patients suffering from various lung conditions. It includes:')
            st.caption('**Age & Gender**: Patient demographics to understand the spread across age groups and gender.')
            st.caption('**Smoking Status**: Whether the patient is a smoker or non-smoker.')
            st.caption('**Lung Capacity**: Measured lung function to assess disease severity.')
            st.caption('**Disease Type**: The specific lung condition, like COPD or Bronchitis.')

        with col2:
            st.caption('**Treatment Type**: Different treatments patients received, including therapy, medication, or surgery.')
            st.caption('**Hospital Visits**: Number of visits to the hospital for managing the condition.')
            st.caption('**Recovery Status**: Indicates whether the patient recovered after treatment.')

        url = 'https://www.kaggle.com/datasets/samikshadalvi/lungs-diseases-dataset'
        st.caption('For more details, check out the the original [source](%s) of the dataset.' % url)

    with pred_tab:
        st.header('Prediction Tab')
        st.caption('In this tab, our ML models will predict if you will recover based on your data.')

        models = load_models()

        model = st.selectbox('Select preferred model for prediction', models.keys())
        model_predictor = models[model]

        col1, col2 = st.columns(2)

        with col1:
            age = st.number_input('What is your age?', min_value=0, max_value=100)
            # Options come from the dataset so they match the categories present in it
            gender = st.radio('What is your gender?', df['Gender'].unique())
            disease = st.selectbox('What is your lung condition?', df['Disease Type'].unique())
            treatment = st.selectbox('Which treatment did you receive?', df['Treatment Type'].unique())

        with col2:
            visits = st.number_input('How many times do you visit the hospital? (Annually)', min_value=0, max_value=365)
            capacity = st.slider('What is your lung capacity?', min_value=1.00, max_value=df['Lung Capacity'].max()+5)
            smoke = st.radio('Do you smoke?', ['Yes', 'No'])

        if st.button('Predict!'):
            pred = prediction(model_predictor, age, gender, smoke, capacity, disease, treatment, visits)
            rec = 'Recovered!' if pred == 1 else 'I am sorry.'
            st.header(rec)

    with data_viz:
        st.title('Data Viz Tab')
        st.caption('In this tab, we can visualize the relationships among our data.')
        st.caption('See our pre-existing plots and you can also plot your own!')

        dviz_tab1, dviz_tab2 = st.tabs(['Plots', 'Custom Plot'])

        with dviz_tab1:
            st.title('Feature Distribution and Relationships')
            st.caption('In this tab we will see the feature distributions of the dataset.')
            st.caption('We can see the relationships of the features among each other.')

            # Create subplots
            fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(15, 25))

            # Distributions of the numerical features
            _plot_histogram(axes[0, 0], df['Age'], 'Age')
            _plot_histogram(axes[0, 1], df['Lung Capacity'], 'Lung Capacity')
            _plot_histogram(axes[1, 0], df['Hospital Visits'], 'Hospital Visits')

            # Categorical features vs. recovery, and disease vs. treatment
            _plot_grouped_counts(axes[1, 1], df, 'Gender', 'Recovered',
                                 'Gender Count by Recovery', 'Recovered')
            _plot_grouped_counts(axes[2, 0], df, 'Smoking Status', 'Recovered',
                                 'Smoking Status by Recovery', 'Recovered')
            _plot_grouped_counts(axes[2, 1], df, 'Disease Type', 'Recovered',
                                 'Disease Type by Recovery', 'Recovered')
            _plot_grouped_counts(axes[3, 0], df, 'Treatment Type', 'Recovered',
                                 'Treatment Type by Recovery', 'Recovered')
            _plot_grouped_counts(axes[3, 1], df, 'Disease Type', 'Treatment Type',
                                 'Disease Type by Treatment Type', 'Treatment')

            # Lay out BEFORE rendering: adjusting the layout after st.pyplot
            # has drawn the figure has no effect on what the user sees.
            fig.tight_layout()
            st.pyplot(fig)
            # Release the figure; otherwise pyplot keeps one more open figure
            # per script rerun.
            plt.close(fig)

        with dviz_tab2:
            x = st.selectbox("Choose X for plotting.", tuple(df.columns))
            # Y may be any column other than the one already chosen for X
            y = st.selectbox("Choose Y for plotting.", tuple(df.drop(x, axis=1).columns))

            plot = st.selectbox("Select type of plot.", ("Scatter", "Bar", "Line"))

            if st.button("Plot X and Y!"):
                if plot == "Scatter":
                    st.scatter_chart(
                        data=df,
                        x=x,
                        y=y,
                        size='Recovered'
                    )
                elif plot == "Bar":
                    st.bar_chart(
                        data=df,
                        x=x,
                        y=y
                    )
                elif plot == "Line":
                    st.line_chart(
                        data=df,
                        x=x,
                        y=y
                    )
|
236 |
+
|
237 |
+
if __name__ == "__main__":
|
238 |
+
main()
|
model.ipynb
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
models/LogisticRegression.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 975
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ede5d2d39b97307cf0645d445fd88119eb980d8d850626eb92cb31945bab01f2
|
3 |
size 975
|
models/RandomForests.pkl
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a7a94e320fd2a2a93f908b6dfc63d3066e86eb4682298d9b3e279a951a2f7fd3
|
3 |
+
size 46939833
|