"""Streamlit app for playing with occupation data from a US Census 2023 sample.

Two "games" are offered through a select box:

* 'Birds of a feather'  - for a chosen protagonist profile, show how the
  occupations of protagonists and their partners relate to each other.
* 'Different strokes'   - compare the occupation mix of two groups that the
  user defines with side-by-side filters.

NOTE(review): this file was reconstructed from a damaged extraction; the
chart type (`px.bar`) of the bar charts and the exact placement of the
branch keywords were inferred from the surrounding code - confirm against
the original app.
"""
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import plotly.express as px


@st.cache_data
def _load_data(path):
    """Read the parquet extract once instead of on every Streamlit rerun."""
    return pd.read_parquet(path)


def _filter_by_attribute(container, frame, attribute, key):
    """Render one filter widget in `container` and return `frame` narrowed to the choice.

    'AGE' gets a range slider spanning the values still present in `frame`;
    every other attribute gets a select box over its remaining unique values.
    """
    if attribute == 'AGE':
        lowest, highest = frame[attribute].min(), frame[attribute].max()
        chosen = container.slider(attribute, lowest, highest, (lowest, highest), key=key)
        return frame[(frame[attribute] >= chosen[0]) & (frame[attribute] <= chosen[1])]
    choice = container.selectbox(attribute, frame[attribute].unique(), key=key)
    return frame[frame[attribute] == choice]


def _occupation_shares(frame):
    """Return one row per 'OCC' with 'HHWT' summed and normalised to sum to 1."""
    shares = frame.groupby('OCC')['HHWT'].sum().reset_index()
    shares['HHWT'] /= shares['HHWT'].sum()
    return shares


df = _load_data('final_occ.parquet')

# NOTE(review): the descriptions (dict values) are not displayed anywhere in
# the visible code; only the keys feed the select box.
games = {
    'Birds of a feather': 'How do people choose their partner, based on their profession ?',
    'Different strokes': 'How do people occupation differ, based on their circumstances ?',
}

st.title('Play with the US Census data 2023')
game = st.selectbox('Pick your Game', list(games.keys()))

st.markdown('*(Source : 1% sample of 2023 US Census)*')

if game == 'Birds of a feather':
    couple_types = sorted(df['couple_type'].value_counts().index)
    couple_type = st.selectbox('Couple Type', couple_types)

    # Give every row a running 1-based id so a person can later be told
    # apart from the other members of the same household.
    filtered = df.copy()
    filtered['id'] = range(1, len(filtered) + 1)

    # Each filter step toasts the share of rows it keeps.
    mask = filtered['couple_type'] == couple_type
    st.toast(f'couple_type = {couple_type} : {mask.mean():.1%}')
    filtered = filtered[mask]

    st.subheader('Who is your protagonist ?')
    from_genders = sorted(filtered['SEX'].unique())
    from_gender = st.selectbox('Protagonist Gender', from_genders)

    protagonists = filtered.copy()
    mask = protagonists['SEX'] == from_gender
    st.toast(f'protagonist gender = {from_gender} : {mask.mean():.1%}')
    protagonists = protagonists[mask]

    data_min, data_max = protagonists['AGE'].agg(['min', 'max'])
    min_age, max_age = st.slider(
        'Protagonist Age Range', data_min, data_max, (data_min, data_max)
    )

    mask = protagonists['AGE'] >= min_age
    st.toast(f'protagonist age >={min_age} : {mask.mean():.1%}')
    protagonists = protagonists[mask]

    mask = protagonists['AGE'] <= max_age
    st.toast(f'protagonist age <= {max_age} : {mask.mean():.1%}')
    protagonists = protagonists[mask]

    # Most common protagonist occupations, weighted by HHWT
    # (presumably the household weight of the census extract - confirm).
    n_top_professions = st.slider('Only keep top N protagonist occupations', 10, 100, 30)
    keep_unemployed = st.checkbox('Keep unemployed protagonist')
    top_professions = (
        protagonists.groupby('OCC')['HHWT']
        .sum()
        .sort_values(ascending=False)[:n_top_professions]
        .reset_index()
    )
    if not keep_unemployed:
        top_professions = top_professions[top_professions['OCC'] != 'Unemployed']
    st.plotly_chart(px.bar(top_professions, x='OCC', y='HHWT', height=800))

    # Pair every protagonist with the other rows of the same household by
    # self-joining on the household key, then dropping self-pairs.
    protagonists_ids = set(protagonists['id'].unique())
    protagonists_house_ids = set(protagonists['CBSERIAL'].unique())

    filtered = filtered[filtered['CBSERIAL'].isin(protagonists_house_ids)]

    data = filtered[['CBSERIAL', 'HHWT', 'OCC', 'id']]
    data = pd.merge(
        data, data, on=['CBSERIAL', 'HHWT'], suffixes=('_protagonist', '_partner')
    )
    data = data[data['id_protagonist'].isin(protagonists_ids)]
    data = data[data['id_protagonist'] != data['id_partner']]

    # Weighted count per (protagonist occupation, partner occupation) pair ...
    data = (
        data.groupby(['OCC_protagonist', 'OCC_partner'])['HHWT']
        .sum()
        .rename('perc_partner')
        .reset_index()
    )
    data = data[data['perc_partner'] > 0]

    # ... turned into a share within each protagonist occupation.
    top_protagonist_occ = (
        data.groupby('OCC_protagonist')['perc_partner']
        .sum()
        .rename('total_protagonist')
        .reset_index()
        .sort_values('total_protagonist', ascending=False)
    )
    data = pd.merge(data, top_protagonist_occ, on=['OCC_protagonist'])
    data['perc_partner'] /= data['total_protagonist']

    # Partner occupations at or below 1% are left out of the pie chart below.
    filter_min_perc = 0.01
    heatmap = data[data['perc_partner'] > filter_min_perc].copy()

    st.subheader('Some insights')
    n_top_protagonists = 10
    same = data[data['OCC_partner'] == data['OCC_protagonist']]
    same = same.sort_values('perc_partner', ascending=False)

    st.plotly_chart(
        px.bar(
            same[:n_top_protagonists],
            x='OCC_protagonist',
            y='perc_partner',
            color='total_protagonist',
            title='Professions most commonly shared with the partner',
        )
    )
    st.plotly_chart(
        px.bar(
            same[-n_top_protagonists:],
            x='OCC_protagonist',
            y='perc_partner',
            color='total_protagonist',
            title='Professions least commonly shared with the partner',
        )
    )

    st.subheader('Pick the occupation of your protagonist')
    contains = st.text_input('Filter occupations', '')
    candidates = top_professions.copy()
    if contains:
        # regex=False: the filter text is a plain substring typed by the user,
        # so characters such as '(' or '+' must not be parsed as a pattern.
        mask = candidates['OCC'].str.lower().str.contains(contains.lower(), regex=False)
        candidates = candidates[mask]
    protagonist_occupation = st.selectbox('Occupation', candidates['OCC'])

    subset = heatmap[heatmap['OCC_protagonist'] == protagonist_occupation]
    st.subheader(
        f'Distribution of partner occupations for protagonist occupation = {protagonist_occupation}'
    )
    fig = px.pie(subset, names="OCC_partner", values='perc_partner')
    fig.update_traces(textposition='inside', textinfo='percent+label')
    # The figure was built but never rendered in the visible code.
    st.plotly_chart(fig)

elif game == 'Different strokes':
    data1 = df.copy()
    data2 = df.copy()

    st.write('Select the attribute to compare between the two groups')

    # One row of widgets per attribute: group 1 on the left, group 2 on the
    # right. Each choice narrows the options offered by the following rows.
    for attribute in ['couple_type', 'SEX', 'AGE', 'MARRNO']:
        left, right = st.columns(2)
        data1 = _filter_by_attribute(left, data1, attribute, f'{attribute}_1')
        data2 = _filter_by_attribute(right, data2, attribute, f'{attribute}_2')

    summary1 = _occupation_shares(data1)
    summary2 = _occupation_shares(data2)
    comparison = pd.merge(
        summary1, summary2, on='OCC', suffixes=('_group1', '_group2'), how='outer'
    )
    # An occupation absent from one group has a share of 0 there; without this
    # the outer join leaves NaN, and those rows fall out of the top-30 ranking.
    share_columns = ['HHWT_group1', 'HHWT_group2']
    comparison[share_columns] = comparison[share_columns].fillna(0)

    comparison['diff'] = comparison['HHWT_group1'] - comparison['HHWT_group2']
    comparison['abs_diff'] = comparison['diff'].abs()
    comparison = comparison.sort_values('abs_diff', ascending=False)[:30]
    st.plotly_chart(
        px.bar(
            comparison.sort_values('diff'),
            x='OCC',
            y='diff',
            color='diff',
            title='Occupation distribution difference between the two groups (group1 - group2)',
            height=800,
        )
    )