Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import streamlit.components.v1 as components
|
3 |
+
import pandas as pd
|
4 |
+
import plotly.express as px
|
5 |
+
|
6 |
+
@st.cache_data
def load_data():
    """Read the census extract from ``final_occ.parquet``.

    Streamlit re-executes this script from the top on every widget
    interaction.  ``st.cache_data`` keeps the parsed frame in memory so the
    parquet file is only read on the first run; later reruns receive a copy
    of the cached frame.
    """
    return pd.read_parquet('final_occ.parquet')


df = load_data()

# Game name (shown in the selector and used as the section title) -> tagline.
games = {
    'Birds of a feather': 'How do people choose their partner, based on their profession ?',
    'Different strokes': 'How do people occupation differ, based on their circumstances ?',
}

st.title('Play with the US Census data 2023')
game = st.selectbox('Pick your Game', list(games))

st.subheader(game)
st.markdown(games[game])
st.markdown('*(Source : 1% sample of 2023 US Census)*')
|
17 |
+
|
18 |
+
if game == 'Birds of a feather':
    # Game 1: for a chosen kind of protagonist, show how the occupations of
    # their partners are distributed.  All sums below are weighted by the
    # HHWT column (presumably the census household weight -- confirm).

    # --- 1. Restrict the sample to one couple type -----------------------
    couple_types = sorted(df['couple_type'].value_counts().index)
    couple_type = st.selectbox('Couple Type', couple_types)

    # Tag every row with a 1-based running id.  After the self-join further
    # down, this id is what tells a person apart from the other member(s)
    # of the same household.
    filtered = df.assign(id=range(1, len(df) + 1))

    mask = filtered['couple_type'] == couple_type
    st.toast(f'couple_type = {couple_type} : {mask.mean():.1%}')
    filtered = filtered[mask]

    # --- 2. Describe the protagonist (gender, then age range) ------------
    st.subheader('Who is your protagonist ?')
    from_genders = sorted(filtered['SEX'].unique())
    from_gender = st.selectbox('Protagonist Gender', from_genders)

    mask = filtered['SEX'] == from_gender
    st.toast(f'protagonist gender = {from_gender} : {mask.mean():.1%}')
    protagonists = filtered[mask]

    data_min, data_max = protagonists['AGE'].agg(['min', 'max'])
    min_age, max_age = st.slider(
        'Protagonist Age Range', data_min, data_max, (data_min, data_max)
    )

    # The two age bounds are applied one after the other so that each toast
    # reports the share kept by that single condition.
    mask = protagonists['AGE'] >= min_age
    st.toast(f'protagonist age >={min_age} : {mask.mean():.1%}')
    protagonists = protagonists[mask]

    mask = protagonists['AGE'] <= max_age
    st.toast(f'protagonist age <= {max_age} : {mask.mean():.1%}')
    protagonists = protagonists[mask]

    ages = protagonists.groupby('AGE')['HHWT'].sum().reset_index()
    st.plotly_chart(px.bar(ages, x='AGE', y='HHWT'))

    # --- 3. Most common protagonist occupations --------------------------
    n_top_professions = st.slider('Only keep top N protagonist occupations', 10, 100, 30)
    keep_unemployed = st.checkbox('Keep unemployed protagonist')

    occupation_weights = protagonists.groupby('OCC')['HHWT'].sum()
    if not keep_unemployed:
        # Drop 'Unemployed' BEFORE ranking; removing it afterwards would
        # leave only N-1 occupations whenever it made the top N.
        occupation_weights = occupation_weights[occupation_weights.index != 'Unemployed']
    top_professions = (
        occupation_weights.sort_values(ascending=False)
        .head(n_top_professions)
        .reset_index()
    )
    st.plotly_chart(px.bar(top_professions, x='OCC', y='HHWT', height=800))

    # --- 4. Pair every protagonist with the other household member(s) ----
    protagonists_ids = set(protagonists['id'].unique())
    protagonists_house_ids = set(protagonists['CBSERIAL'].unique())

    # Keep only rows that share a CBSERIAL with at least one protagonist.
    filtered = filtered[filtered['CBSERIAL'].isin(protagonists_house_ids)]

    # Self-join on (CBSERIAL, HHWT): every row is matched with every row
    # carrying the same pair of values, itself included.
    data = filtered[['CBSERIAL', 'HHWT', 'OCC', 'id']]
    data = pd.merge(
        data, data, on=['CBSERIAL', 'HHWT'], suffixes=('_protagonist', '_partner')
    )
    is_protagonist = data['id_protagonist'].isin(protagonists_ids)
    is_other_person = data['id_protagonist'] != data['id_partner']
    data = data[is_protagonist & is_other_person]

    # Weighted count per (protagonist occupation, partner occupation) pair.
    data = (
        data.groupby(['OCC_protagonist', 'OCC_partner'])['HHWT']
        .sum()
        .rename('perc_partner')
        .reset_index()
    )
    data = data[data['perc_partner'] > 0]

    # Turn the counts into shares within each protagonist occupation.
    top_protagonist_occ = (
        data.groupby('OCC_protagonist')['perc_partner']
        .sum()
        .rename('total_protagonist')
        .reset_index()
        .sort_values('total_protagonist', ascending=False)
    )
    data = pd.merge(data, top_protagonist_occ, on=['OCC_protagonist'])
    data['perc_partner'] /= data['total_protagonist']

    # Pairs at or below this share are left out of the pie chart.
    filter_min_perc = 0.01
    heatmap = data[data['perc_partner'] > filter_min_perc]

    # --- 5. Insights: occupations shared by both partners ----------------
    st.subheader('Some insights')
    same = data[data['OCC_partner'] == data['OCC_protagonist']]
    n_top_protagonists = 10

    same_ranked = same.sort_values('perc_partner', ascending=False)
    st.plotly_chart(px.bar(
        same_ranked[:n_top_protagonists],
        x='OCC_protagonist',
        y='perc_partner',
        color='total_protagonist',
        title='Professions most commonly shared with the partner',
    ))
    st.plotly_chart(px.bar(
        same_ranked[-n_top_protagonists:],
        x='OCC_protagonist',
        y='perc_partner',
        color='total_protagonist',
        title='Professions least commonly shared with the partner',
    ))

    # --- 6. Partner distribution for one chosen occupation ---------------
    st.subheader('Pick the occupation of your protagonist')
    contains = st.text_input('Filter occupations', '')
    candidates = top_professions
    if contains:
        # regex=False: the box holds free text typed by the user, so
        # characters such as '(' or '+' must be matched literally instead
        # of being compiled as a (possibly invalid) regular expression.
        mask = candidates['OCC'].str.lower().str.contains(contains.lower(), regex=False)
        candidates = candidates[mask]
    protagonist_occupation = st.selectbox('Occupation', candidates['OCC'])

    if protagonist_occupation is None:
        # st.selectbox returns None when it has no options to offer, i.e.
        # when the text filter matched nothing.
        st.warning('No occupation matches this filter.')
    else:
        subset = heatmap[heatmap['OCC_protagonist'] == protagonist_occupation]
        st.subheader(
            f'Distribution of partner occupations for protagonist occupation = {protagonist_occupation}'
        )
        fig = px.pie(subset, names="OCC_partner", values='perc_partner')
        fig.update_traces(
            textposition='inside', textinfo='percent+label', showlegend=False
        )
        st.plotly_chart(fig)
|
123 |
+
|
124 |
+
def _narrow_group(frame, attribute, container, key):
    """Render one selector for *attribute* and return the matching rows.

    Args:
        frame: rows still in the group before this attribute is applied.
        attribute: column of *frame* to filter on; it is also the widget label.
        container: Streamlit container (here: a column) that hosts the widget.
        key: unique Streamlit widget key.

    Returns:
        The rows of *frame* whose *attribute* equals the picked value, or,
        for ``'AGE'``, lies inside the picked inclusive range.
    """
    values = frame[attribute]
    if attribute == 'AGE':
        lowest, highest = values.min(), values.max()
        picked_low, picked_high = container.slider(
            attribute, lowest, highest, (lowest, highest), key=key
        )
        return frame[(values >= picked_low) & (values <= picked_high)]
    choice = container.selectbox(attribute, values.unique(), key=key)
    return frame[values == choice]


def _occupation_shares(frame):
    """Return each OCC's share of the summed HHWT in *frame* (shares add up to 1)."""
    shares = frame.groupby('OCC')['HHWT'].sum().reset_index()
    shares['HHWT'] /= shares['HHWT'].sum()
    return shares


# `game` is always one of the two keys of `games`, so this condition is the
# exact complement of the 'Birds of a feather' branch.
if game == 'Different strokes':
    # Game 2: define two groups with the same set of selectors and plot the
    # occupations whose share differs most between them.
    st.write('Select the attribute to compare between the two groups')
    header1, header2 = st.columns(2)
    header1.subheader('Group1')
    header2.subheader('Group2')

    # Boolean indexing returns new frames and `df` is never modified here,
    # so both groups can start from it without copying.
    data1 = data2 = df

    # Selectors are applied in sequence: each one only offers the values
    # still present after the previous selectors of the same group.
    for attribute in ['couple_type', 'SEX', 'AGE', 'MARRNO']:
        left, right = st.columns(2)
        data1 = _narrow_group(data1, attribute, left, f'{attribute}_1')
        data2 = _narrow_group(data2, attribute, right, f'{attribute}_2')

    comparison = pd.merge(
        _occupation_shares(data1),
        _occupation_shares(data2),
        on='OCC',
        suffixes=('_group1', '_group2'),
        how='outer',
    )

    # An occupation present in only one group has a share of 0 in the other.
    # Assign the result back: `comparison[col].fillna(0, inplace=True)` acts
    # on a temporary column object and does not update the frame under
    # pandas Copy-on-Write.
    share_columns = ['HHWT_group1', 'HHWT_group2']
    comparison[share_columns] = comparison[share_columns].fillna(0)

    comparison['diff'] = comparison['HHWT_group1'] - comparison['HHWT_group2']
    comparison['abs_diff'] = comparison['diff'].abs()

    # Keep the 30 largest gaps, then order them by signed difference so the
    # bars run from "more common in group2" to "more common in group1".
    comparison = comparison.sort_values('abs_diff', ascending=False)[:30]
    st.plotly_chart(px.bar(
        comparison.sort_values('diff'),
        x='OCC',
        y='diff',
        color='diff',
        title='Occupation distribution difference between the two groups (group1 - group2)',
        height=800,
    ))
|
162 |
+
|