Spaces:
Sleeping
Sleeping
File size: 8,736 Bytes
a514b64 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 |
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import streamlit as st
from datetime import datetime
from pprint import pprint
from scipy.stats import bootstrap
# Load data
# Each non-blank line of data.txt is split on whitespace and is one of:
#   two tokens   -> an annual record; the second token is wrapped in parentheses
#                   (presumably "<year> (<cases>)" -- confirm against data.txt)
#   three tokens -> a monthly record "<month> <year> (<cases>)"
with open('data.txt', 'r') as f:
    cases_data = f.readlines()

monthly_records = []
annual_records = []
for case_count in cases_data:
    data = case_count.split()
    # Skip blank lines (e.g. a trailing newline at the end of the file);
    # without this guard they crash with an IndexError on data[2] below
    if not data:
        continue
    # Annual data
    if len(data) == 2:
        data[1] = data[1].replace('(', '').replace(')', '')
        annual_records.append((int(data[0]), int(data[1])))
        continue
    # Monthly data
    data[2] = data[2].replace('(', '').replace(')', '')
    monthly_records.append((data[0], int(data[1]), int(data[2])))
# Presidents covered by the data, in chronological order.
_pres_names = [
    'Lyndon B. Johnson',
    'Richard Nixon',
    'Gerald Ford',
    'Jimmy Carter',
    'Ronald Reagan',
    'George H. W. Bush',
    'Bill Clinton',
    'George W. Bush',
    'Barack Obama',
    'Donald Trump',
    'Joe Biden',
]
# Every term starts on the day the previous one ends, so each hand-over
# date is listed once: entry k is the start of president k and entry k + 1
# is the end of that same term.
_term_boundaries = [
    datetime(1963, 11, 22),
    datetime(1969, 1, 20),
    datetime(1974, 8, 9),
    datetime(1977, 1, 20),
    datetime(1981, 1, 20),
    datetime(1989, 1, 20),
    datetime(1993, 1, 20),
    datetime(2001, 1, 20),
    datetime(2009, 1, 20),
    datetime(2017, 1, 20),
    datetime(2021, 1, 20),
    datetime(2023, 6, 28),  # cut Biden short so that it lines up with our last data point
]
# (name, start, end) tuples, one per president
pres_records = list(zip(_pres_names, _term_boundaries, _term_boundaries[1:]))
pres_df = pd.DataFrame.from_records(pres_records, columns=['name', 'start', 'end'])
# Clean the data
# month name -> month number (January = 1 ... December = 12)
month2int = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            'January',
            'February',
            'March',
            'April',
            'May',
            'June',
            'July',
            'August',
            'September',
            'October',
            'November',
            'December',
        ],
        start=1,
    )
}

mn_df = pd.DataFrame.from_records(monthly_records, columns=['month', 'year', 'cases'])

# Give every monthly record a real datetime; the day is always 28,
# which is used as the placeholder day for every month
dts = [
    datetime(year=yr, month=month2int[mn], day=28)
    for mn, yr in zip(mn_df['month'], mn_df['year'])
]
mn_df['date'] = dts
# This is the first year that has more than 1 case
clipped_mn_df = mn_df.query('year >= 1964')

# add 0s for months that are missing
# we cut off at 1964 but Johnson started in November of 1963
# There weren't any cases in 1963 so it's okay to start
# filling 0s from November of 1963
FIRST_MONTH = (1963, 11)
# our data goes through the end of the previous month (june 2023)
LAST_MONTH = (2023, 6)

# `in` on a pandas Series checks the index rather than the values, so
# collect the dates we already have into a set of plain datetimes;
# this also makes each membership test O(1) instead of a scan
existing_dates = {datetime(d.year, d.month, d.day) for d in clipped_mn_df['date']}

new_rows = []
cur_yr, cur_mn = FIRST_MONTH
# (year, month) tuples compare chronologically
while (cur_yr, cur_mn) <= LAST_MONTH:
    # we're using 28 as the placeholder "day" for all the months
    dt = datetime(year=cur_yr, month=cur_mn, day=28)
    if dt not in existing_dates:
        new_rows.append((dt.strftime('%B'), dt.year, 0, dt))
    # step to the next month, rolling over into the next year after December
    if cur_mn == 12:
        cur_yr += 1
        cur_mn = 1
    else:
        cur_mn += 1

zero_rows = pd.DataFrame.from_records(new_rows, columns=['month', 'year', 'cases', 'date'])
clipped_mn_df = pd.concat([clipped_mn_df, zero_rows], ignore_index=True)
# newest month first
clipped_mn_df = clipped_mn_df.sort_values(by='date', ascending=False).reset_index(drop=True)
# add the mean & std for each president
# Pull the terms out once as plain (name, start, end) tuples instead of
# re-running iterrows() over pres_df for every month
pres_terms = list(pres_df[['name', 'start', 'end']].itertuples(index=False, name=None))
presidents = []
for d in clipped_mn_df['date']:
    for name, start, end in pres_terms:
        if start <= d <= end:
            presidents.append(str(name))
            # Both ends of a term are inclusive, so a date that falls exactly
            # on a hand-over day matches two presidents. Stop at the first
            # match so that every month gets exactly one label.
            break
    else:
        # Without this, an unmatched month only surfaces later as a
        # confusing length-mismatch error on the column assignment
        raise ValueError(f'no presidential term covers {d}')
clipped_mn_df['pres'] = presidents

# per-president mean and standard deviation of the monthly case counts
tmp = clipped_mn_df[['cases', 'pres']].groupby('pres').agg(['mean', 'std']).reset_index(drop=False)
tmp.columns = ['name', 'cases_mean', 'cases_std']
pres_df = pd.merge(pres_df, tmp, on='name', how='inner')
# bootstrap confidence intervals for the mean
# the data doesn't really look normal enough for 2 std to be super meaningful
pres_names = pres_df['name'].unique()
president_cis = []
for pres in pres_names:
    # Select with a boolean mask rather than an interpolated query string,
    # which breaks as soon as a name contains a double quote
    cases = clipped_mn_df.loc[clipped_mn_df['pres'] == pres, 'cases'].to_numpy()
    ci = bootstrap(
        (cases,),  # bootstrap expects a sequence of samples; we have exactly one
        np.mean,
        vectorized=True,  # np.mean accepts `axis`, so the resamples are averaged in bulk
        confidence_level=0.95,
        method='BCa'  # "bias-corrected and accelerated" (shifts the CI bounds if the distribution is skewed)
    ).confidence_interval
    president_cis.append((pres, ci.low, ci.high))
ci_df = pd.DataFrame.from_records(president_cis, columns=['name', 'ci_low', 'ci_high'])

# add the confidence intervals to pres_df
pres_df = pd.merge(pres_df, ci_df, on='name')
# Utils for converting colors
def hex2rgb(h):
    """
    Convert a six-digit hex color into an 'rgb(r, g, b)' string.

    '#FF44BB' -> 'rgb(255, 68, 187)'

    The leading '#' is optional and the digits may be upper or lower case.

    Raises ValueError if, after dropping the optional '#', the input is not
    exactly six hexadecimal digits (this includes the empty string).
    """
    digits = h[1:] if h.startswith('#') else h
    # int(..., base=16) tolerates signs and surrounding whitespace
    # (e.g. '+1'), so check the characters explicitly
    if len(digits) != 6 or any(ch not in '0123456789abcdefABCDEF' for ch in digits):
        raise ValueError(f'malformed hex input: {h!r}')
    red, green, blue = (int(digits[i:i + 2], base=16) for i in range(0, 6, 2))
    return f'rgb({red}, {green}, {blue})'
def rgb2rgba(c, a=1.0):
    """
    Add an alpha channel to an 'rgb(...)' color string.

    'rgb(95, 70, 144)' -> 'rgba(95, 70, 144, 1.0)'

    `a` is the opacity; it defaults to 100% opacity
    but you can set it. It is formatted into the result as-is.

    Raises ValueError if `c` is not of the form 'rgb(...)'; in particular
    a string that is already 'rgba(...)' is rejected rather than being
    turned into 'rgbaa(...'.
    """
    if not (c.startswith('rgb(') and c.endswith(')')):
        raise ValueError(f'expected an rgb(...) color, got: {c!r}')
    # c[4:-1] is everything between 'rgb(' and the closing ')'
    return f'rgba({c[4:-1]}, {a})'
# Draw the plot

# streamlit ignores this but streamlit's theme
# is pure white so it's okay I guess?
pio.templates.default = 'plotly_white'

FONT_SIZE = 14
bar_color = '#bbbbbb'

# the hover box uses a mostly transparent version of the bar color
bar_hover_label = dict(
    bgcolor=rgb2rgba(hex2rgb(bar_color), 0.2),
    font=dict(size=FONT_SIZE),
)

# the monthly case counts, drawn as a bar plot
cases_trace = go.Bar(
    x=clipped_mn_df['date'],
    y=clipped_mn_df['cases'],
    name='DOJ Antitrust Cases',
    marker_color=bar_color,
    marker_line_color=bar_color,
    hovertemplate='%{x}: <b>%{y}</b><extra></extra>',
    hoverlabel=bar_hover_label,
    # default is 1000. Bigger means closer to the top
    legendrank=1000 + 1,
)

f = go.Figure()
f.add_trace(cases_trace)
# add the president means + CI
pres_colors = px.colors.qualitative.Prism
for i, r in pres_df.iterrows():
    # set up colors for this president
    # wrap around the palette so that having more presidents than
    # palette entries reuses colors instead of raising IndexError
    pres_color = pres_colors[i % len(pres_colors)]
    if pres_color.startswith('#'):
        pres_color = hex2rgb(pres_color)
    ci_color = rgb2rgba(pres_color, 0.5)
    hover_color = rgb2rgba(pres_color, 0.2)
    hover_str = f"<b>{r['name']}</b><br>Mean: <b>{r['cases_mean']:.2f}</b><br>95% CI: <b>({r['ci_low']:.2f}–{r['ci_high']:.2f})</b><extra></extra>"
    hover_label_fmt = {'bgcolor': hover_color, 'font': {'size': FONT_SIZE}}

    # add this president's confidence interval
    #
    # draw two lines like this
    #
    #   o------------------o
    #
    #   o------------------o
    #
    # make the lines transparent,
    # fill in the area between them
    upper = r['ci_high']
    lower = r['ci_low']
    f.add_trace(go.Scatter(
        x=[r['start'], r['end'], r['end'], r['start']],
        y=[upper, upper, lower, lower],
        fill='toself',
        fillcolor=ci_color,
        line_color=rgb2rgba(pres_color, 0),
        # I have to set `name` for it to show up when I hover over any part of the fill
        # otherwise the hover only comes up when I hover over the corners where the points are
        # but `name` doesn't do the <extra></extra> thing to remove the extra hover box
        name=hover_str.replace('<extra></extra>', ''),
        showlegend=False,
        hovertemplate=hover_str,
        hoverlabel=hover_label_fmt
    ))

    # add this president's mean: a horizontal line spanning the term
    f.add_trace(go.Scatter(
        x=[r['start'], r['end']],
        y=[r['cases_mean'], r['cases_mean']],
        name=r['name'],
        line_color=pres_color,
        # I used to have vertical bars at the ends of the mean line
        # but I like it more without them
        # so just set the width to 0
        marker={'symbol': 'line-ns', 'line': {'width': 0, 'color': pres_color}},
        hovertemplate=hover_str,
        hoverlabel=hover_label_fmt
    ))
# A few outlier months are tall enough to squash the president
# aggregations, so cap the visible y range
MAX_HEIGHT = 16
f.update_yaxes(range=[0, MAX_HEIGHT])

# Every bar that gets cut off by the cap is marked with a patterned
# overlay near the top of the plot
too_tall = clipped_mn_df.loc[clipped_mn_df['cases'] > MAX_HEIGHT, 'date']
n_too_tall = len(too_tall)
overlay_trace = go.Bar(
    x=too_tall,
    y=[MAX_HEIGHT * 0.25] * n_too_tall,
    base=[MAX_HEIGHT - MAX_HEIGHT * 0.1] * n_too_tall,
    marker_color='#fff',
    marker_line_color='rgba(255,255,255,0)',
    marker_line_width=0,
    # original author's note: plotly uses milliseconds when the axis is a
    # datetime, and 1 month is about 2.6 * 10**9 milliseconds, so the
    # width has to be huge to cover a whole month
    width=3e9,
    # pattern shape options: ['', '/', '\\', 'x', '-', '|', '+', '.']
    marker_pattern_shape='-',
    marker_pattern_fillmode='replace',
    showlegend=False,
)
f.add_trace(overlay_trace)
# Final layout tweaks, applied in a single call
f.update_layout(
    barmode='stack',
    title="<b>What does the DOJ's Antitrust Division look like?</b>",
    # since streamlit doesn't respect the Plotly theme,
    # we can instead make the background transparent
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# Render the figure in a wide-layout streamlit page.
# theme=None is passed so the figure is drawn with its own styling.
st.set_page_config(layout='wide')
st.plotly_chart(f, use_container_width=True, theme=None)