Final commit
Browse files- AIDS_Classification_50000.csv +0 -0
- eda.py +94 -147
- model_svc.pkl +3 -0
- prediction.py +3 -37
AIDS_Classification_50000.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
eda.py
CHANGED
@@ -2,170 +2,117 @@ import streamlit as st
|
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
4 |
|
5 |
-
# This cell is used to make new column called 'group_age'
|
6 |
-
def filter_group_age(age):
    """Map a numeric age to a coarse age-group label.

    The groups are contiguous half-open ranges, so fractional ages
    (e.g. 14.5) land in a real group instead of falling between two
    integer brackets:

    - ``0 <= age < 15``  -> ``"children"``
    - ``15 <= age < 25`` -> ``"youth"``
    - ``25 <= age < 65`` -> ``"adults"``
    - ``age >= 65``      -> ``"seniors"``

    Args:
        age: A number (int or float). NaN is accepted.

    Returns:
        One of ``"children"``, ``"youth"``, ``"adults"``, ``"seniors"``,
        or ``"unknown"`` for negative or NaN input.
    """
    # ``not age >= 0`` (rather than ``age < 0``) also catches NaN, because
    # every ordering comparison against NaN evaluates to False.
    if not age >= 0:
        return "unknown"
    if age < 15:
        return "children"
    if age < 25:
        return "youth"
    if age < 65:
        return "adults"
    return "seniors"
|
17 |
-
|
18 |
-
|
19 |
|
20 |
def run():
|
21 |
-
st.title('
|
22 |
-
st.image('https://
|
23 |
st.write('This page is made by Yudis Aditya')
|
24 |
st.markdown('---')
|
25 |
|
26 |
-
|
27 |
-
|
28 |
|
29 |
|
30 |
-
|
31 |
-
# df['group_age'] = df['age'].apply(filter_group_age)
|
32 |
|
33 |
-
|
34 |
-
|
35 |
# # This cell is used to create histogram column age
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
# st.write('- 0 - 14 = children ')
|
47 |
-
# st.write('- 15 - 24 = youth')
|
48 |
-
# st.write('- 25 - 64 = adults ')
|
49 |
-
# st.write('- more than equal 65 = seniors')
|
50 |
-
|
51 |
-
|
52 |
-
# data = df.groupby('group_age').size()
|
53 |
-
# x = data.index
|
54 |
-
# y = data.values
|
55 |
-
# fig = plt.figure(figsize=(15,5))
|
56 |
-
# plt.pie(y,labels=x,autopct='%1.1f%%')
|
57 |
-
# plt.title("Pie Chart Data based on Group Age")
|
58 |
-
# st.pyplot(plt)
|
59 |
|
60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
61 |
|
62 |
|
63 |
-
|
64 |
-
|
65 |
-
#
|
66 |
-
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
|
76 |
-
|
77 |
-
|
78 |
-
#
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
|
89 |
-
#
|
90 |
-
|
91 |
-
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
# 'Apr':data_1.values[0],
|
102 |
-
# 'May':data_2.values[0],
|
103 |
-
# 'Jun':data_3.values[0],
|
104 |
-
# 'July':data_4.values[0],
|
105 |
-
# 'August':data_5.values[0],
|
106 |
-
# 'Sep':data_6.values[0],
|
107 |
-
# }
|
108 |
-
|
109 |
-
# data_visualize = pd.DataFrame([data_input])
|
110 |
-
# data_visualize.values.tolist()[0]
|
111 |
-
|
112 |
-
# # print(data)
|
113 |
-
# x = data_visualize.columns.to_list()
|
114 |
-
# y = data_visualize.values.tolist()[0]
|
115 |
-
|
116 |
-
# plt.plot(x,y)
|
117 |
-
# plt.title("Total status payment duly in 2005")
|
118 |
-
# st.pyplot(fig)
|
119 |
-
# st.write('From Graph above we can know that Total people who pay duly are increased significant from April 2005 until September 2005. It indicate that client has good behaviour to pay their bill, so the way to selection client who can have credit card is already good.')
|
120 |
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
# 'Apr':data_1.values[0],
|
133 |
-
# 'May':data_2.values[0],
|
134 |
-
# 'Jun':data_3.values[0],
|
135 |
-
# 'July':data_4.values[0],
|
136 |
-
# 'August':data_5.values[0],
|
137 |
-
# 'Sep':data_6.values[0],
|
138 |
-
# }
|
139 |
-
|
140 |
-
# data_visualize = pd.DataFrame([data_input])
|
141 |
-
# data_visualize.values.tolist()[0]
|
142 |
-
|
143 |
-
# # print(data)
|
144 |
-
# x = data_visualize.columns.to_list()
|
145 |
-
# y = data_visualize.values.tolist()[0]
|
146 |
-
|
147 |
-
# plt.bar(x,y)
|
148 |
-
# plt.title("Median Bill amount from April - September 2005")
|
149 |
-
# st.pyplot(fig)
|
150 |
-
# st.write('From graph above i use median instead mean because distribution data is not normal. We can see that median bill amount from april until September is decrased. That visualize that most people has decrased bill amount every month, which is good. ')
|
151 |
|
152 |
-
|
153 |
-
|
154 |
-
|
155 |
-
|
156 |
-
|
157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
-
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
# st.markdown('---')
|
165 |
-
# st.write('# Conclusion')
|
166 |
-
# st.write('From analysis and creating visualize from my dataset , This is a important point that i can share:')
|
167 |
-
# st.write("- For promotion , don't target for group age old and young. And we can make deal promotion with product that focus on increase status life style like accesoris, outfit, etc using credit card. Also deal promotion with product that related with woman like makeup, salon , etc.")
|
168 |
-
# st.write('- For April until September 2005, we can see that our client summarize has good behavior because dominant pay dully and decrase bill amount for every month. It is indicate the process selection client who can use credit card is good.')
|
169 |
|
170 |
if __name__ == '__main__':
|
171 |
run()
|
|
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
|
6 |
def _render_figure(fig):
    """Send *fig* to the Streamlit page, then close it."""
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates until it is
    # closed explicitly, so close each one once it has been rendered.
    plt.close(fig)


def _pie_chart(values, labels, title, autopct):
    """Draw one pie chart on its own 15x5 figure and render it."""
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.pie(values, labels=labels, autopct=autopct)
    ax.set_title(title)
    _render_figure(fig)


def _infection_histogram(df, column):
    """Overlay histograms of *column* for non-infected and infected rows.

    Both series share one axes, so each gets a label and some
    transparency; otherwise the second histogram hides the first and
    the two groups cannot be told apart.
    """
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.set_title(f"Distribution data people infected and not infected based on {column}")
    ax.hist(df[df['infected'] == 0][column], alpha=0.6, label='Not Infected')
    ax.hist(df[df['infected'] == 1][column], alpha=0.6, label='Infected')
    ax.legend()
    _render_figure(fig)


def run():
    """Render the AIDS-infection EDA page.

    Reads ``AIDS_Classification_50000.csv`` from the working directory
    and writes a sequence of headings, charts and commentary to the
    Streamlit page. Takes no arguments and returns nothing.
    """
    st.title('AIDS Virus Infection Exploration Data Analyst(EDA)')
    st.image('https://images.theconversation.com/files/404868/original/file-20210607-28173-1rw95dw.jpg?ixlib=rb-4.1.0&q=45&auto=format&w=926&fit=clip')
    st.write('This page is made by Yudis Aditya')
    st.markdown('---')

    st.write('In this page, I want to show visualization data based on my analyst process about AIDS Infection. This EDA has information about people characteristic and people behaviour')
    st.link_button('Link dataset','https://www.kaggle.com/datasets/aadarshvelu/aids-virus-infection-prediction?select=AIDS_Classification_5000.csv')

    df = pd.read_csv('AIDS_Classification_50000.csv')

    st.write('## 1. People Characteristic')

    # 1.1 Share of infected vs. not-infected rows.
    # The heading used to read "Distribution Data Age", which did not
    # describe the pie chart rendered below it.
    st.write('### 1.1 Comparison people Not Infected and Infected By AIDS')
    # NOTE(review): the hard-coded labels rely on groupby() returning the
    # groups in sorted order (0 first, then 1) - confirm the column coding.
    infected_counts = df.groupby(['infected']).size()
    _pie_chart(
        infected_counts.values,
        ["Not Infected", "Infected"],
        "Comparison people Not Infected and Infected By AIDS",
        '%1.0f%%',
    )
    st.write("From Graph above we can know that in my dataset people dominant not infected by AIDS. It is indicate that people already aware about danger AIDS and try to take therapy to prevent it before it's too late.")

    # 1.2 Age distribution, split by infection status.
    st.write('### 1.2 Distribution data people infected and not infected based on age')
    _infection_histogram(df, 'age')
    st.write('From Graph above we can know distribution data people infected and not infected based on age has same pattern and dominant in range age 25 - 45. From this data, we can know people got infected AIDS or HIV when people is already teenager or adult. Goverment can make campaign for danger and anticipate HIV and AIDS around that age so the campaign can be more effective than target campaign to child or elder.')

    # 1.3 Share of rows per value of the 'homo' column.
    st.write('### 1.3 Percentage Homosexual and Heterosexual who have HIV')
    homo_counts = df.groupby('homo').size()
    # autopct now carries a literal percent sign; previously the slices
    # showed bare numbers under a title that promises percentages.
    _pie_chart(
        homo_counts.values,
        ['Heterosexual', 'Homosexual'],
        "Percentage Homosexual and Heterosexual who have HIV",
        "%.2f%%",
    )
    st.write('From Graph above we can know that people total people who is homosexual that has HIV is more than Heterosexual. From my assumption this can be happened because they dont have to worry because somoene dont have to be preagnant. So with this data , i hope we can give this information to people who has homosexual activity that it has another bad impact beside that.')

    # 1.4 cd40 distribution, split by infection status.
    st.write('### 1.4 Distribution data people infected and not infected based on cd40')
    _infection_histogram(df, 'cd40')
    st.write('Cd40 is used as important metrix to determine people immunity who has HIV. Based graph above, we can know that people who has AIDS has the most CD40 arount 180 - 300. This can be warning area for every who has HIV to be more careful if has CD40 almost at that point. ')

    # 1.5 cd80 distribution, split by infection status.
    st.write('### 1.5 Distribution data people infected and not infected based on cd80')
    _infection_histogram(df, 'cd80')
    st.write('CD80 is a transmembrane protein that functions as a co-stimulatory molecule in antigen-presenting cells, such as dendritic cells, macrophages, and B cells. Based graph above, we can know that people who has AIDS has the most CD80 arount 800 - 1200. This can be warning area for every who has HIV to be more careful if has CD80 almost at that point. ')

    # 1.6 Share of rows per value of the 'race' column.
    st.write('### 1.6 Percentage Race White and Non-White')
    race_counts = df.groupby('race').size()
    _pie_chart(
        race_counts.values,
        ['White', 'Non-White'],
        "Percentage Race White and Non-White",
        "%.2f%%",
    )
    st.write('From Graph above we can know that total people with race white who has HIV is more than non-white. This can be happening by many factor, based on race white beahviour, tradition, believes, habit, etc. So my suggestion is to create film or another socialize method to more race white people aware about HIV and how to prevent it. ')

    st.write('## 2. People Behaviour')

    # 2.1 Horizontal bar chart: infected rows counted per treatment code.
    st.write('### 2.1 Distribution Data patient who has aids based on their treatment type')
    # NOTE(review): the labels assume exactly four 'trt' codes, returned by
    # groupby() in this order - confirm against the dataset description.
    treatment_counts = df[df['infected'] == 1].groupby('trt').size()
    treatment_labels = ['ZDV only', 'ZDV + ddl', 'ZDV + Zal', 'ddl only']
    fig, ax = plt.subplots(figsize=(15, 5))
    ax.barh(treatment_labels, treatment_counts.values)
    ax.set_title("Distribution Data patient who has aids based on their treatment type")
    _render_figure(fig)
    st.write("Based on graph above we can know the most people who has AIDS is only take ZDV treatment and ZDV + ddl is the lowest. I can assume that goverment can remove treatment for only ZDV only and increase treatment for ZDV + ddl for increase number suvivor")

    st.markdown('---')
    st.write('# Conclusion')
    st.write('From analysis and creating visualize from my dataset , This is a important point that i can share:')
    st.write("- The most effective treatment before AIDS is ZDV + ddl")
    st.write('- People who has HIV is around 25 - 45 years old. Dominant who has experience homosexual activity or white race. Has benchmark cd40 for get AIDS around 180 - 300 and cd80 around 800 - 1200.')


# Render the page when this module is executed as a script.
if __name__ == '__main__':
    run()
|
model_svc.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:10c695c95da3ddbfbd333d39930f300b0b5fd17f28e5ee6451dcb12373fb04b9
|
3 |
+
size 3701911
|
prediction.py
CHANGED
@@ -9,8 +9,8 @@ with open('list_col_num.txt', 'r') as file_1:
|
|
9 |
with open('list_col_cat.txt', 'r') as file_2:
|
10 |
list_col_cat_if = json.load(file_2)
|
11 |
|
12 |
-
with open('
|
13 |
-
|
14 |
|
15 |
def get_trt_index(trt):
|
16 |
if trt == 'ZDV only':
|
@@ -91,40 +91,6 @@ def run():
|
|
91 |
cd420 = st.number_input('Input CD4 at 20+/-5 weeks', value=40.0)
|
92 |
cd80 = st.number_input('Input CD8', value=40.0)
|
93 |
cd820 = st.number_input('Input CD8 at 20+/-5 weeks', value=40.0)
|
94 |
-
# limit_balance = st.number_input('Input limit balance', value=10000.0)
|
95 |
-
# sex = st.selectbox('Gender', {'Male','Female'},index=0)
|
96 |
-
|
97 |
-
# marital_status = st.selectbox('Marital Status ', {'Married','Single','Others'},index=0)
|
98 |
-
# age = st.number_input('Age', value=20)
|
99 |
-
|
100 |
-
# st.markdown('---')
|
101 |
-
# st.write('Repayment Status')
|
102 |
-
|
103 |
-
# pay_1 = st.selectbox('RePayment Status September 2005 ', {'Pay Duly','Payment delay for one month','Payment delay for two month','Payment delay for three month','Payment delay for four month','Payment delay for five month','Payment delay for six month','Payment delay for seven month','Payment delay for eight month','Payment delay for nine month'},index=0)
|
104 |
-
# pay_2 = st.selectbox('RePayment Status August 2005 ', {'Pay Duly','Payment delay for one month','Payment delay for two month','Payment delay for three month','Payment delay for four month','Payment delay for five month','Payment delay for six month','Payment delay for seven month','Payment delay for eight month','Payment delay for nine month'},index=0)
|
105 |
-
# pay_3 = st.selectbox('RePayment Status July 2005 ', {'Pay Duly','Payment delay for one month','Payment delay for two month','Payment delay for three month','Payment delay for four month','Payment delay for five month','Payment delay for six month','Payment delay for seven month','Payment delay for eight month','Payment delay for nine month'},index=0)
|
106 |
-
# pay_4 = st.selectbox('RePayment Status June 2005 ', {'Pay Duly','Payment delay for one month','Payment delay for two month','Payment delay for three month','Payment delay for four month','Payment delay for five month','Payment delay for six month','Payment delay for seven month','Payment delay for eight month','Payment delay for nine month'},index=0)
|
107 |
-
# pay_5 = st.selectbox('RePayment Status May 2005 ', {'Pay Duly','Payment delay for one month','Payment delay for two month','Payment delay for three month','Payment delay for four month','Payment delay for five month','Payment delay for six month','Payment delay for seven month','Payment delay for eight month','Payment delay for nine month'},index=0)
|
108 |
-
# pay_6 = st.selectbox('RePayment Status April 2005 ', {'Pay Duly','Payment delay for one month','Payment delay for two month','Payment delay for three month','Payment delay for four month','Payment delay for five month','Payment delay for six month','Payment delay for seven month','Payment delay for eight month','Payment delay for nine month'},index=0)
|
109 |
-
|
110 |
-
# st.markdown('---')
|
111 |
-
# st.write('Bill Amount')
|
112 |
-
# bill_amt_1 = st.number_input('Input Billing Amount September 2005 ', value=10000.0)
|
113 |
-
# bill_amt_2 = st.number_input('Input Billing Amount August 2005 ', value=10000.0)
|
114 |
-
# bill_amt_3 = st.number_input('Input Billing Amount July 2005 ', value=10000.0)
|
115 |
-
# bill_amt_4 = st.number_input('Input Billing Amount June 2005 ', value=10000.0)
|
116 |
-
# bill_amt_5 = st.number_input('Input Billing Amount May 2005 ', value=10000.0)
|
117 |
-
# bill_amt_6 = st.number_input('Input Billing Amount April 2005 ', value=10000.0)
|
118 |
-
|
119 |
-
# st.markdown('---')
|
120 |
-
# st.write('Amount Previous payment')
|
121 |
-
# pay_amt_1 = st.number_input('Amount of previous payment in September 2005 ', value=10000.0)
|
122 |
-
# pay_amt_2 = st.number_input('Amount of previous payment in August 2005 ', value=10000.0)
|
123 |
-
# pay_amt_3 = st.number_input('Amount of previous payment in July 2005 ', value=10000.0)
|
124 |
-
# pay_amt_4 = st.number_input('Amount of previous payment in June 2005 ', value=10000.0)
|
125 |
-
# pay_amt_5 = st.number_input('Amount of previous payment in May 2005 ', value=10000.0)
|
126 |
-
# pay_amt_6 = st.number_input('Amount of previous payment in April 2005 ', value=10000.0)
|
127 |
-
|
128 |
submitted = st.form_submit_button("Submit")
|
129 |
st.write("Outside the form")
|
130 |
|
@@ -159,7 +125,7 @@ def run():
|
|
159 |
df = df[list_col_cat_if + list_col_num_if]
|
160 |
|
161 |
# Do model predict from data input
|
162 |
-
predict_result =
|
163 |
if predict_result[0] == 1 :
|
164 |
predic_result_value = 'yes'
|
165 |
else:
|
|
|
9 |
with open('list_col_cat.txt', 'r') as file_2:
|
10 |
list_col_cat_if = json.load(file_2)
|
11 |
|
12 |
+
with open('model_svc.pkl', 'rb') as file_3:
|
13 |
+
model_svc = pickle.load(file_3)
|
14 |
|
15 |
def get_trt_index(trt):
|
16 |
if trt == 'ZDV only':
|
|
|
91 |
cd420 = st.number_input('Input CD4 at 20+/-5 weeks', value=40.0)
|
92 |
cd80 = st.number_input('Input CD8', value=40.0)
|
93 |
cd820 = st.number_input('Input CD8 at 20+/-5 weeks', value=40.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
94 |
submitted = st.form_submit_button("Submit")
|
95 |
st.write("Outside the form")
|
96 |
|
|
|
125 |
df = df[list_col_cat_if + list_col_num_if]
|
126 |
|
127 |
# Do model predict from data input
|
128 |
+
predict_result = model_svc.predict(df)
|
129 |
if predict_result[0] == 1 :
|
130 |
predic_result_value = 'yes'
|
131 |
else:
|