Spaces:
Sleeping
Sleeping
import numpy as np | |
import pandas as pd | |
import plotly.express as px | |
import plotly.graph_objects as go | |
import plotly.io as pio | |
import streamlit as st | |
from datetime import datetime | |
from pprint import pprint | |
from scipy.stats import bootstrap | |
# Load data
# data.txt holds one count per line, either an annual total such as
# "2023 (11)" or a monthly count such as "June 2023 (2)".
with open('data.txt', 'r') as f:
    cases_data = f.readlines()

monthly_records = []  # (month name, year, cases)
annual_records = []   # (year, cases)
for case_count in cases_data:
    data = case_count.split()

    # A blank or whitespace-only line splits into an empty list; skip it
    # instead of failing on the field lookups below.
    if not data:
        continue

    # Annual data: two fields, the year and the parenthesised count
    if len(data) == 2:
        data[1] = data[1].replace('(', '').replace(')', '')
        annual_records.append((int(data[0]), int(data[1])))
        continue

    # Monthly data: month name, year, parenthesised count
    data[2] = data[2].replace('(', '').replace(')', '')
    monthly_records.append((data[0], int(data[1]), int(data[2])))
# Presidential terms, listed by the date each one started.
# Every term ends on the day the next one starts, so only the start dates
# are spelled out and the end dates are derived from them.
_TERM_STARTS = [
    ('Lyndon B. Johnson', datetime(1963, 11, 22)),
    ('Richard Nixon', datetime(1969, 1, 20)),
    ('Gerald Ford', datetime(1974, 8, 9)),
    ('Jimmy Carter', datetime(1977, 1, 20)),
    ('Ronald Reagan', datetime(1981, 1, 20)),
    ('George H. W. Bush', datetime(1989, 1, 20)),
    ('Bill Clinton', datetime(1993, 1, 20)),
    ('George W. Bush', datetime(2001, 1, 20)),
    ('Barack Obama', datetime(2009, 1, 20)),
    ('Donald Trump', datetime(2017, 1, 20)),
    ('Joe Biden', datetime(2021, 1, 20)),
]

# cut Biden short so that it lines up with our last data point
_LAST_DATA_POINT = datetime(2023, 6, 28)

_term_ends = [start for _, start in _TERM_STARTS[1:]] + [_LAST_DATA_POINT]
pres_records = [
    (name, start, end)
    for (name, start), end in zip(_TERM_STARTS, _term_ends)
]

pres_df = pd.DataFrame.from_records(pres_records, columns=['name', 'start', 'end'])
# Clean the data
# Map each English month name to its calendar number (January -> 1).
month2int = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            'January', 'February', 'March', 'April', 'May', 'June',
            'July', 'August', 'September', 'October', 'November', 'December',
        ],
        start=1,
    )
}

mn_df = pd.DataFrame.from_records(monthly_records, columns=['month', 'year', 'cases'])

# Give every monthly record a real datetime; day 28 is used as the
# placeholder day for every month.
dts = [
    datetime(year=yr, month=month2int[month_name], day=28)
    for month_name, yr in zip(mn_df['month'], mn_df['year'])
]
mn_df['date'] = dts
# This is the first year that has more than 1 case
clipped_mn_df = mn_df.query('year >= 1964')

# add 0s for months that are missing
# we cut off at 1964 but Johnson started in November of 1963
# There weren't any cases in 1963 so it's okay to start
# filling 0s from November of 1963
#
# `in` on a pandas Series tests the index labels rather than the values,
# so collect the months we already have into a set of (year, month) keys.
# Every date uses 28 as the placeholder "day", so year + month identifies
# a record, and the set gives constant-time lookups.
existing_months = {(d.year, d.month) for d in clipped_mn_df['date']}

# our data goes through the end of the previous month (june 2023)
# Months are counted as year * 12 + zero-based month so the whole span is
# a plain integer range.
first_month_index = 1963 * 12 + (11 - 1)
last_month_index = 2023 * 12 + (6 - 1)

new_rows = []
for month_index in range(first_month_index, last_month_index + 1):
    cur_yr, zero_based_mn = divmod(month_index, 12)
    cur_mn = zero_based_mn + 1
    if (cur_yr, cur_mn) in existing_months:
        continue
    dt = datetime(year=cur_yr, month=cur_mn, day=28)
    new_rows.append((dt.strftime('%B'), dt.year, 0, dt))

zero_rows = pd.DataFrame.from_records(new_rows, columns=['month', 'year', 'cases', 'date'])
clipped_mn_df = pd.concat([clipped_mn_df, zero_rows], ignore_index=True)
# newest month first
clipped_mn_df = clipped_mn_df.sort_values(by='date', ascending=False).reset_index(drop=True)
# add the mean & std for each president
# Pull the terms out once as plain tuples so the lookup below does not
# rebuild a row Series for every (date, president) pair.
_terms = list(pres_df[['name', 'start', 'end']].itertuples(index=False, name=None))


def _president_on(day):
    """Return the name of the first term in pres_df that covers `day`.

    Both ends of a term are inclusive, so a date that falls exactly on a
    shared boundary matches two rows; taking the first match keeps the
    result at exactly one name per date.

    Raises:
        ValueError: if no term covers `day`.
    """
    for name, start, end in _terms:
        if start <= day <= end:
            return str(name)
    raise ValueError(f'no presidential term in pres_df covers {day}')


# exactly one label per row, so the list always lines up with the frame
presidents = [_president_on(d) for d in clipped_mn_df['date']]
clipped_mn_df['pres'] = presidents

# per-president mean and standard deviation of the monthly case counts
tmp = clipped_mn_df[['cases', 'pres']].groupby('pres').agg(['mean', 'std']).reset_index(drop=False)
tmp.columns = ['name', 'cases_mean', 'cases_std']
pres_df = pd.merge(pres_df, tmp, on='name', how='inner')
# bootstrap confidence intervals for the mean
# the data doesn't really look normal enough for 2 std to be super meaningful
pres_names = pres_df['name'].unique()


def _bootstrap_mean_ci(monthly_cases):
    """Return the 95% bootstrap confidence interval of the mean of `monthly_cases`."""
    result = bootstrap(
        monthly_cases.reshape(1, -1),
        np.mean,
        vectorized=False,
        confidence_level=0.95,
        # "bias-corrected and accelerated" (shifts the CI bounds if the distribution is skewed)
        method='BCa',
    )
    return result.confidence_interval


president_cis = []
for pres in pres_names:
    # monthly case counts that fall inside this president's term
    cases = clipped_mn_df.loc[clipped_mn_df['pres'] == pres, 'cases'].to_numpy()
    interval = _bootstrap_mean_ci(cases)
    president_cis.append((pres, interval.low, interval.high))

ci_df = pd.DataFrame.from_records(president_cis, columns=['name', 'ci_low', 'ci_high'])

# add the confidence intervals to pres_df
pres_df = pd.merge(pres_df, ci_df, on='name')
# Utils for converting colors
def hex2rgb(h):
    """
    Convert a six-digit hex color into an 'rgb(r, g, b)' string.

    '#FF44BB' -> 'rgb(255, 68, 187)'

    The leading '#' is optional and the digits may be upper or lower case.

    Raises:
        ValueError: if, after dropping one leading '#', the input is not
            exactly six hexadecimal digits.
    """
    # startswith() is safe on the empty string, unlike indexing h[0]
    digits = h[1:] if h.startswith('#') else h

    # int(..., base=16) also accepts signs and surrounding whitespace, so
    # check the characters explicitly instead of relying on it to reject
    # malformed input.
    if len(digits) != 6 or not set(digits) <= set('0123456789abcdefABCDEF'):
        raise ValueError(f'malformed hex input: {h!r}')

    red, green, blue = (int(digits[i:i + 2], base=16) for i in range(0, 6, 2))
    return f'rgb({red}, {green}, {blue})'
def rgb2rgba(c, a=1.0):
    """
    Turn an 'rgb(...)' color string into an 'rgba(...)' string with alpha `a`.

    'rgb(95, 70, 144)' -> 'rgba(95, 70, 144, 1.0)'

    `a` defaults to 1.0 (100% opacity) and is formatted into the string as-is.
    The input is not validated: an 'a' is inserted after the first three
    characters and the final character is replaced by ', {a})'.
    """
    with_alpha_prefix = f'{c[:3]}a{c[3:]}'
    return f'{with_alpha_prefix[:-1]}, {a})'
# Draw the plot

# streamlit ignores this but streamlit's theme
# is pure white so it's okay I guess?
pio.templates.default = 'plotly_white'

FONT_SIZE = 14
bar_color = '#bbbbbb'

f = go.Figure()

# add the cases as a bar plot
# The hover box gets a faint tint of the bar color as its background.
bar_hoverlabel = {
    'bgcolor': rgb2rgba(hex2rgb(bar_color), 0.2),
    'font': {'size': FONT_SIZE},
}
case_bars = go.Bar(
    x=clipped_mn_df['date'],
    y=clipped_mn_df['cases'],
    name='DOJ Antitrust Cases',
    marker={'color': bar_color, 'line': {'color': bar_color}},
    hovertemplate='%{x}: <b>%{y}</b><extra></extra>',
    hoverlabel=bar_hoverlabel,
    # The default legendrank is 1000; per the Plotly docs, a rank above the
    # default lists this trace after the traces that keep the default rank.
    legendrank=1000 + 1,
)
f.add_trace(case_bars)
# add the president means + CI
pres_colors = px.colors.qualitative.Prism
for row_number, (_, r) in enumerate(pres_df.iterrows()):
    # set up colors for this president
    # Index by position and wrap around the palette, so that having more
    # presidents than palette entries reuses colors instead of raising
    # IndexError.
    pres_color = pres_colors[row_number % len(pres_colors)]
    # palette entries may be hex strings; everything below wants 'rgb(...)'
    if pres_color[0] == '#':
        pres_color = hex2rgb(pres_color)
    ci_color = rgb2rgba(pres_color, 0.5)
    hover_color = rgb2rgba(pres_color, 0.2)
    hover_str = f"<b>{r['name']}</b><br>Mean: <b>{r['cases_mean']:.2f}</b><br>95% CI: <b>({r['ci_low']:.2f}–{r['ci_high']:.2f})</b><extra></extra>"
    hover_label_fmt = {'bgcolor': hover_color, 'font': {'size': FONT_SIZE}}

    # add this president's confidence interval
    #
    # draw two lines like this
    #
    #   o------------------o
    #
    #   o------------------o
    #
    # make the lines transparent,
    # fill in the area between them
    upper = r['ci_high']
    lower = r['ci_low']
    f.add_trace(go.Scatter(
        x=[r['start'], r['end'], r['end'], r['start']],
        y=[upper, upper, lower, lower],
        fill='toself',
        fillcolor=ci_color,
        line_color=rgb2rgba(pres_color, 0),
        # I have to set `name` for it to show up when I hover over any part of the fill
        # otherwise the hover only comes up when I hover over the corners where the points are
        # but `name` doesn't do the <extra></extra> thing to remove the extra hover box
        name=hover_str.replace('<extra></extra>', ''),
        showlegend=False,
        hovertemplate=hover_str,
        hoverlabel=hover_label_fmt
    ))

    # add this president's mean
    f.add_trace(go.Scatter(
        x=[r['start'], r['end']],
        y=[r['cases_mean'], r['cases_mean']],
        name=r['name'],
        line_color=pres_color,
        # I used to have vertical bars at the ends of the mean line
        # but I like it more without them
        # so just set the width to 0
        marker={'symbol': 'line-ns', 'line': {'width': 0, 'color': pres_color}},
        hovertemplate=hover_str,
        hoverlabel=hover_label_fmt
    ))
# Trim the top of the plot a bit because there are a few outliers
# that make it hard to see the president aggregations
MAX_HEIGHT = 16
f.update_yaxes(range=[0, MAX_HEIGHT])

# add hatching over any bars taller than MAX_HEIGHT
# since we're cutting them off
too_tall = clipped_mn_df.loc[clipped_mn_df['cases'] > MAX_HEIGHT, 'date']
clipped_count = len(too_tall)

hatch_marker = {
    'color': '#fff',
    'line': {'color': 'rgba(255,255,255,0)', 'width': 0},
    # these are the shape options ['', '/', '\\', 'x', '-', '|', '+', '.']
    'pattern': {'shape': '-', 'fillmode': 'replace'},
}
f.add_trace(go.Bar(
    x=too_tall,
    # each overlay starts just below the top of the visible range and
    # extends past it
    y=[MAX_HEIGHT * 0.25] * clipped_count,
    base=[MAX_HEIGHT - MAX_HEIGHT*0.1] * clipped_count,
    marker=hatch_marker,
    # I think I remember plotly uses milliseconds if the axis is a datetime
    # so the width has to be huge to cover a whole month
    # yep 1 month is 2.6 * 10**9 milliseconds
    width=3e9,
    showlegend=False,
))

f.update_layout(barmode='stack')
# Title plus transparent backgrounds in a single layout update.
# since streamlit doesn't respect the Plotly theme,
# we can instead make the background transparent
f.update_layout(
    title="<b>What does the DOJ's Antitrust Division look like?</b>",
    plot_bgcolor='rgba(0, 0, 0, 0)',
    paper_bgcolor='rgba(0, 0, 0, 0)',
)

# Configure the page, then render the figure across the full container
# width with Streamlit's own chart theme turned off (theme=None).
st.set_page_config(layout='wide')
st.plotly_chart(f, use_container_width=True, theme=None)
# Markdown copy shown under the chart: first the motivation, then the data notes.
WHAT_IS_THIS_MD = """
# What is this?
I want to challenge myself to make a visualization I'm proud of.
If I ever find myself thinking
> I could probably clean that up but ugh that would be annoying I don't wanna
then I absolutely have to clean that up.
That's about the end of my thought process.
I've seen a few antitrust cases involving Big Tech companies in the news recently.
Those articles make me curious how common antitrust cases are in general.
How many does the DOJ file each month?
Are the numbers pretty consistent or are there large swings?
What does it look like?
"""

THE_DATA_MD = """
# The data
To get the data, I went to [the website for the Antitrust Division of the U.S. Department of Justice](https://www.justice.gov/atr/antitrust-case-filings), clicked "Filter by Case Open Date" in the left menu, and clicked "Show more." That gave me a pretty clean list that I could highlight and copy.
```
June 2023 (2)
April 2023 (1)
March 2023 (1)
February 2023 (2)
January 2023 (5)
2023 (11)
November 2022 (4)
[ . . . ]
```
There are some obvious problems with this data.
For example, I found [a Wikipedia article about U.S. antitrust law](https://en.wikipedia.org/wiki/United_States_antitrust_law).
That page refers to a case that happened in 1943, but my data doesn't have any cases in 1943.
My best guess is that this data only includes records the DOJ has digitized and made available on their website.
There's also at least one typo in this random menu on the DOJ's website.
For the annual count of all cases opened in 2022, they list the correct amount but they label it "2026" instead.
I didn't notice any other typos.
I'm sure there are a few I missed.
As I said above, I just want to turn some data into a pretty plot.
"""

# Three equal columns; all of the text goes in the middle one.
col1, col2, col3 = st.columns(3)
col2.markdown(WHAT_IS_THIS_MD)
col2.markdown(THE_DATA_MD)