File size: 10,159 Bytes
583b3ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
680da52
583b3ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
import streamlit as st
import pandas as pd
from joblib import load
import numpy as np
import matplotlib.pyplot as plt

@st.cache_data
def load_data() -> pd.DataFrame:
    """
    Loads the `.csv` data using pandas and fills in missing values.

    Numerical columns are imputed with their column mean and categorical
    columns with their mode (most frequent value).

    The result is cached with `st.cache_data` rather than `st.cache_resource`:
    a DataFrame is plain data, and `cache_data` gives each caller its own copy
    so one session can never mutate the table seen by another.

    Returns:
        The imputed lung disease dataset.
    """
    df = pd.read_csv('./lung_disease_data.csv')

    # Impute numerical features with their mean
    numerical_columns = ['Age', 'Lung Capacity', 'Hospital Visits']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Impute categorical features with their mode (most frequent value)
    categorical_columns = ['Gender', 'Smoking Status', 'Disease Type', 'Treatment Type', 'Recovered']
    # `.mode()` returns a DataFrame (ties produce extra rows); row 0 holds the
    # first mode of every column.
    df[categorical_columns] = df[categorical_columns].fillna(df[categorical_columns].mode().iloc[0])

    return df

@st.cache_resource
def load_models() -> dict:
    """
    Reads the pickled estimators from `./models`.

    Returns:
        A dict mapping each model's display name to the loaded estimator,
        in the order the models are listed below.
    """
    # (display name, pickle path) pairs; the order here is the order of the
    # keys in the returned dict.
    model_files = (
        ('Gaussian Naive Bayes', './models/GaussianNB.pkl'),
        ('Logistic Regression', './models/LogisticRegression.pkl'),
        ('Random Forest', './models/RandomForests.pkl'),
        ('Support Vector Machines', './models/SVM.pkl'),
        ('XG Boost', './models/XGBoost.pkl'),
    )

    return {label: load(path) for label, path in model_files}

def prediction(model, age: int, gender: str,
               smoke_status: str, lung_capacity: float,
               disease_type: str, treatment_type: str,
               hospital_visits: int
               ) -> int:
    """
    Predicts the outcome for a single patient with the given model.

    Builds a one-row feature table (numeric features, binary-encoded gender
    and smoking status, one-hot encoded disease and treatment type) and
    passes it to ``model.predict`` as a NumPy array.

    Args:
        model: Object exposing ``predict(array)``; the first element of the
            result must support ``.item()``.
        age: Patient age.
        gender: ``"Male"`` is encoded as 1, anything else as 0.
        smoke_status: ``"Yes"`` is encoded as 1, anything else as 0.
        lung_capacity: Measured lung capacity.
        disease_type: Disease category, either bare (``"COPD"``) or as the
            full column name (``"Disease Type_COPD"``).
        treatment_type: Treatment category, either bare (``"Surgery"``) or as
            the full column name (``"Treatment Type_Surgery"``).
        hospital_visits: Number of hospital visits.

    Returns:
        The first predicted value as a plain Python scalar.
    """
    disease_types = ('Asthma', 'Bronchitis', 'COPD', 'Lung Cancer', 'Pneumonia')
    treatment_types = ('Medication', 'Surgery', 'Therapy')

    # Insertion order defines the column order of the array given to the model.
    features = {
        'Age': [age],
        'Hospital Visits': [hospital_visits],
        'Lung Capacity': [lung_capacity],
        'Gender': [1 if gender == "Male" else 0],
        'Smoking Status': [1 if smoke_status == "Yes" else 0],
    }

    # One-hot encode by exact match. A substring test (`value in column_name`)
    # would switch on every column for an empty string or a shared fragment
    # such as "Type", producing an invalid multi-hot row.
    for name in disease_types:
        column = f'Disease Type_{name}'
        features[column] = [1 if disease_type in (name, column) else 0]

    for name in treatment_types:
        column = f'Treatment Type_{name}'
        features[column] = [1 if treatment_type in (name, column) else 0]

    input_arr = pd.DataFrame(features).to_numpy()

    predicted = model.predict(input_arr)[0]

    # `.item()` unwraps the NumPy scalar into a native Python value.
    return predicted.item()

def _render_data_tab(df: pd.DataFrame) -> None:
    """Renders the 'About Data' tab: the raw table and a description of each column."""
    st.header('About the Data')
    st.caption('In this tab, we will explore the particular details about our data.')

    st.caption('Take a look at the data table.')
    st.dataframe(df)

    col1, col2 = st.columns(2)

    with col1:
        st.caption('This dataset captures detailed information about patients suffering from various lung conditions. It includes:')
        st.caption('**Age & Gender**: Patient demographics to understand the spread across age groups and gender.')
        st.caption('**Smoking Status**: Whether the patient is a smoker or non-smoker.')
        st.caption('**Lung Capacity**: Measured lung function to assess disease severity.')
        # Kept inside the column so it lines up with the other field descriptions.
        st.caption('**Disease Type**: The specific lung condition, like COPD or Bronchitis.')

    with col2:
        st.caption('**Treatment Type**: Different treatments patients received, including therapy, medication, or surgery.')
        st.caption('**Hospital Visits**: Number of visits to the hospital for managing the condition.')
        st.caption('**Recovery Status**: Indicates whether the patient recovered after treatment.')

    url = 'https://www.kaggle.com/datasets/samikshadalvi/lungs-diseases-dataset'
    st.caption('For more details, check out the original [source](%s) of the dataset.' % url)


def _render_prediction_tab(df: pd.DataFrame) -> None:
    """Renders the 'Prediction' tab: collects patient inputs and shows the model's verdict."""
    st.header('Prediction Tab')
    st.caption('In this tab, our ML models will predict if you will recover based on your data.')

    models = load_models()

    model = st.selectbox('Select preferred model for prediction', list(models))
    model_predictor = models[model]

    col1, col2 = st.columns(2)

    with col1:
        age = st.number_input('What is your age?', min_value=0, max_value=100)
        # Choices come from the dataset so they match the categories it contains.
        gender = st.radio('What is your gender?', df['Gender'].unique())
        disease = st.selectbox('What is your lung condition?', df['Disease Type'].unique())
        treatment = st.selectbox('Which treatment did you receive?', df['Treatment Type'].unique())

    with col2:
        visits = st.number_input('How many times do you visit the hospital? (Annually)', min_value=0, max_value=365)
        # Allow values somewhat beyond the largest capacity observed in the data.
        capacity = st.slider('What is your lung capacity?', min_value=1.00, max_value=df['Lung Capacity'].max()+5)
        smoke = st.radio('Do you smoke?', ['Yes', 'No'])

    if st.button('Predict!'):
        pred = prediction(model_predictor, age, gender, smoke, capacity, disease, treatment, visits)
        rec = 'Recovered!' if pred == 1 else 'I am sorry.'
        st.header(rec)


def _plot_histogram(ax, df: pd.DataFrame, column: str) -> None:
    """Draws the frequency histogram of `column` on `ax`."""
    ax.hist(df[column])
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    ax.set_title('%s Distribution' % column)


def _plot_grouped_counts(ax, df: pd.DataFrame, group_col: str, hue_col: str,
                         title: str, legend_title: str) -> None:
    """Draws side-by-side bars on `ax` counting rows per `group_col`, split by `hue_col`."""
    # Rows: group_col values, columns: hue_col values, cells: row counts.
    count_data = df.groupby([group_col, hue_col]).size().unstack(fill_value=0)
    count_data.plot(kind='bar', stacked=False, ax=ax)
    ax.set_xlabel(group_col)
    ax.set_ylabel('Count')
    ax.set_title(title)
    ax.legend(title=legend_title)


def _render_plots_tab(df: pd.DataFrame) -> None:
    """Renders the fixed 4x2 grid of distribution and relationship plots."""
    st.title('Feature Distribution and Relationships')
    st.caption('In this tab we will see the feature distributions of the dataset.')
    st.caption('We can see the relationships of the features among each other.')

    fig, axes = plt.subplots(nrows=4, ncols=2, figsize=(15, 25))

    # Distributions of the numerical features
    _plot_histogram(axes[0, 0], df, 'Age')
    _plot_histogram(axes[0, 1], df, 'Lung Capacity')
    _plot_histogram(axes[1, 0], df, 'Hospital Visits')

    # Categorical features against the recovery outcome
    _plot_grouped_counts(axes[1, 1], df, 'Gender', 'Recovered',
                         'Gender Count by Recovery', 'Recovered')
    _plot_grouped_counts(axes[2, 0], df, 'Smoking Status', 'Recovered',
                         'Smoking Status by Recovery', 'Recovered')
    _plot_grouped_counts(axes[2, 1], df, 'Disease Type', 'Recovered',
                         'Disease Type by Recovery', 'Recovered')
    _plot_grouped_counts(axes[3, 0], df, 'Treatment Type', 'Recovered',
                         'Treatment Type by Recovery', 'Recovered')

    # Disease Type vs Treatment Type
    _plot_grouped_counts(axes[3, 1], df, 'Disease Type', 'Treatment Type',
                         'Disease Type by Treatment Type', 'Treatment')

    # The layout must be adjusted BEFORE rendering; doing it afterwards has no
    # effect on the image Streamlit has already produced.
    fig.tight_layout()
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # pyplot does not keep accumulating open figures.
    plt.close(fig)


def _render_custom_plot_tab(df: pd.DataFrame) -> None:
    """Renders the 'Custom Plot' tab: the user picks two columns and a chart type."""
    x = st.selectbox("Choose X for plotting.", tuple(df.columns))
    # Y is offered from every column except the one already chosen for X.
    y = st.selectbox("Choose Y for plotting.", tuple(df.drop(x, axis=1).columns))

    plot = st.selectbox("Select type of plot.", ("Scatter", "Bar", "Line"))

    if st.button("Plot X and Y!"):
        if plot == "Scatter":
            # NOTE(review): `size` is driven by the 'Recovered' column, which
            # looks categorical; confirm this is intended rather than `color`.
            st.scatter_chart(
                data=df,
                x=x,
                y=y,
                size='Recovered'
            )
        elif plot == "Bar":
            st.bar_chart(
                data=df,
                x=x,
                y=y
            )
        elif plot == "Line":
            st.line_chart(
                data=df,
                x=x,
                y=y
            )


def _render_viz_tab(df: pd.DataFrame) -> None:
    """Renders the 'Data Viz' tab with its 'Plots' and 'Custom Plot' sub-tabs."""
    st.title('Data Viz Tab')
    st.caption('In this tab, we can visualize the relationships among our data.')
    st.caption('See our pre-existing plots and you can also plot your own!')

    dviz_tab1, dviz_tab2 = st.tabs(['Plots', 'Custom Plot'])

    with dviz_tab1:
        _render_plots_tab(df)

    with dviz_tab2:
        _render_custom_plot_tab(df)


def main():
    """
    Builds the Streamlit page.

    Shows the app header, loads the dataset once, and renders the three
    top-level tabs: 'About Data', 'Prediction' and 'Data Viz'.
    """
    st.header("Lung Disease Recovery Predictor")
    st.caption('Prepared by `hydraadra112` | John Manuel Carado')

    data_tab, pred_tab, data_viz = st.tabs(['About Data', 'Prediction', 'Data Viz'])
    df = load_data()

    with data_tab:
        _render_data_tab(df)

    with pred_tab:
        _render_prediction_tab(df)

    with data_viz:
        _render_viz_tab(df)
    
# Build the page only when this file is the entry-point script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()