# bbcardsnmore / app.py
# Last commit by carolanderson (85379ec): change csv to tsv in file download
import math
import pandas as pd
import streamlit as st
@st.cache_data
def load_data(file, skiprows=0, encoding="ISO-8859-1"):
    """Read a tab-separated file into a DataFrame.

    Args:
        file: Path or file-like object, handed unchanged to ``pandas.read_csv``.
        skiprows: Forwarded to ``pandas.read_csv`` as ``skiprows``.
        encoding: Text encoding used to decode the file.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    read_options = {"sep": "\t", "skiprows": skiprows, "encoding": encoding}
    return pd.read_csv(file, **read_options)
@st.cache_data
def convert_df(df):
    """Serialise ``df`` as tab-separated text and return it as UTF-8 bytes.

    The index is not written, so the output contains only the data columns.
    """
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")
# Substrings that mark a card as not belonging to one single player.
_NON_NAME_MARKERS = (
    '1992 Hoops set', '2016 Topps Now Highlights', 'Fox-Aparicio',
    '1960 World Series', 'Mantle/Berra', 'Ashburn-Mays', 'Ruth/Aaron/Mays',
    'Mays/Snider', 'New York Yankees', 'Checklist',
)


# NOTE: deliberately not wrapped in st.cache_data. This helper runs once per
# cell, and per-call cache bookkeeping would cost more than the string split.
def extract_surname(name):
    """Split a card's player name into a ``(first_name, last_name)`` pair.

    Args:
        name: The raw "Player Name" cell. Anything that is not a string
            (for example the NaN pandas uses for an empty cell) is treated
            as missing.

    Returns:
        A 2-tuple:
          * ``(None, None)`` for missing, empty or whitespace-only names;
          * hard-coded pairs for the special cases listed below;
          * ``("multiple", "multiple")`` for multi-player / non-player cards
            (any name containing a comma or one of ``_NON_NAME_MARKERS``);
          * otherwise everything before the last whitespace-separated word
            as the first name (``""`` for a one-word name) and the last word
            as the last name.

    Raises:
        ValueError: If an otherwise unrecognised name contains ``-`` or ``/``,
            which suggests a multi-player card missing from the marker list.
    """
    if not isinstance(name, str):
        return None, None
    # A blank cell has no last word to take; report it as missing instead of
    # failing with IndexError on name.split()[-1].
    if not name.strip():
        return None, None

    # Special cases that the generic "last word is the surname" rule gets wrong.
    if name == "G Hill Tribute":
        return "G", "Hill"
    if name in ('Ken Griffey, Jr.', 'Ken Griffey Jr.'):
        return "Ken", "Griffey Jr."
    if 'Vladimir Guerrero Jr.' in name:
        return "Vladimir", 'Guerrero Jr.'

    if "," in name or any(marker in name for marker in _NON_NAME_MARKERS):
        return 'multiple', 'multiple'

    if "-" in name or "/" in name:
        # Fail loudly, and say which name triggered it, so the marker list
        # can be extended.
        raise ValueError(f"Found suspected multiple-name card! ({name!r})")

    *first_parts, last_name = name.split()
    return " ".join(first_parts), last_name
# NOTE: deliberately not wrapped in st.cache_data. This helper runs once per
# cell, and per-call cache bookkeeping would cost more than the check itself.
def add_grading_status(grader):
    """Classify one "Grader" cell as graded ("Yes") or ungraded ("No").

    Args:
        grader: The raw cell value. A string means a grader is recorded; a
            missing value (NaN, ``None``, ``pd.NA``) means the card is ungraded.

    Returns:
        ``"Yes"`` for any string, ``"No"`` for a missing value.

    Raises:
        ValueError: If the cell holds anything else (for example a number).
    """
    if isinstance(grader, str):
        return "Yes"
    # pd.isna recognises every scalar missing marker, whereas math.isnan
    # raises TypeError on None. The is_scalar guard keeps array-likes out of
    # the truth test.
    if pd.api.types.is_scalar(grader) and pd.isna(grader):
        return "No"
    raise ValueError(f"Found unexpected item in Grader column! ({grader!r})")
@st.cache_data
def get_default_sort_order(df):
    """Build the initial sort-priority table for the columns of ``df``.

    Returns a DataFrame with one row per column of ``df`` and two fields:
    ``column`` (the column name) and ``order`` (its sort priority). The
    preferred columns that are present in ``df`` come first, numbered from 0
    in their preferred sequence. Every other column follows in its original
    position with ``order`` left as ``None``.
    """
    preferred = ("Type", "Graded", "Sport", "Last Name", "Year")
    available = list(df.columns.values)
    ranked = [name for name in preferred if name in available]

    rows = []
    for position, name in enumerate(ranked):
        rows.append({"column": name, "order": position})
    for name in available:
        if name not in ranked:
            rows.append({"column": name, "order": None})
    return pd.DataFrame(rows)
@st.cache_data
def get_sort_order(edited_sort_order):
    """Return the column names to sort by, highest priority first.

    ``edited_sort_order`` is a table with ``column`` and ``order`` fields.
    Rows whose ``order`` is missing are ignored; the rest are arranged by
    ascending ``order``. Rows sharing the same ``order`` keep their table order.
    """
    ranked = edited_sort_order.dropna(subset=["order"])
    pairs = list(zip(ranked["order"].tolist(), ranked["column"].tolist()))
    pairs.sort(key=lambda pair: pair[0])  # list.sort is stable, like sorted()
    return [name for _, name in pairs]
@st.cache_data
def add_graded_column(df):
    """Replace the "Grader" column with a Yes/No "Graded" column.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.

    Returns:
        A DataFrame without "Grader" and with a categorical "Graded" column
        (categories ordered "Yes", "No"). If ``df`` has no "Grader" column, a
        warning is shown and ``df`` is returned unchanged.
    """
    if "Grader" not in df:
        st.warning('Input data must have a "Grader" column'
                   ' in order to create a "Graded" column', icon="⚠️")
        return df

    graded = pd.Categorical(
        df["Grader"].apply(add_grading_status),
        categories=["Yes", "No"],  # category order sets the sort order
    )
    # Build a new frame instead of assigning into `df`: inside a cached
    # function an in-place write would reach the caller only on a cache miss.
    return df.drop(columns=["Grader"]).assign(Graded=graded)
@st.cache_data
def add_multiple_column(df):
    """Replace the "Quantity" column with a Yes/No "Multiple" column.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.

    Returns:
        A DataFrame without "Quantity" and with a categorical "Multiple"
        column (categories ordered "Yes", "No") that is "Yes" where the
        quantity is greater than 1. If ``df`` has no "Quantity" column, a
        warning is shown and ``df`` is returned unchanged.
    """
    if "Quantity" not in df:
        st.warning('Input data must have a "Quantity" column'
                   ' in order to do this', icon="⚠️")
        return df

    multiple = pd.Categorical(
        df["Quantity"].apply(lambda qty: "Yes" if qty > 1 else "No"),
        categories=["Yes", "No"],  # category order sets the sort order
    )
    # Build a new frame instead of assigning into `df`: inside a cached
    # function an in-place write would reach the caller only on a cache miss.
    return df.drop(columns=["Quantity"]).assign(Multiple=multiple)
@st.cache_data
def add_first_and_last_name_columns(df):
    """Add "First Name" and "Last Name" columns derived from "Player Name".

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned.

    Returns:
        A DataFrame with "First Name" and "Last Name" columns holding the two
        parts returned by ``extract_surname`` for each row. If ``df`` has no
        "Player Name" column, a warning is shown and ``df`` is returned
        unchanged.
    """
    if "Player Name" not in df:
        st.warning('Input data must have a "Player Name" column'
                   ' in order to extract first and last names', icon="⚠️")
        return df

    # Parse each name once and reuse the pair for both columns.
    parsed = [extract_surname(name) for name in df["Player Name"]]
    first_names = pd.Series([pair[0] for pair in parsed], index=df.index, dtype=object)
    last_names = pd.Series([pair[-1] for pair in parsed], index=df.index, dtype=object)
    # Build a new frame instead of assigning into `df`: inside a cached
    # function an in-place write would reach the caller only on a cache miss.
    return df.assign(**{"First Name": first_names, "Last Name": last_names})
def main():
    """Render the Streamlit page.

    Flow: pick an input (upload or one of two bundled samples), show it,
    apply the optional column transformations, optionally sort, show the
    result and offer it as a tab-separated download.
    """
    st.markdown("# Baseball card data wrangling")
    st.write("Upload a tab-separated spreadsheet. The first row should contain column"
             " names.")
    # load_data() parses every input as tab-separated text, so Excel
    # workbooks (which it cannot read) are not offered as an upload type.
    input_file = st.file_uploader("Choose a file", type=['txt', 'csv', 'tsv'])
    # A ticked sample checkbox takes precedence over any uploaded file.
    if st.checkbox("Use sample data 1 (baseball cards)"):
        input_file = "sample_data/sample_data_1.txt"
    elif st.checkbox("Use sample data 2 (big cats)"):
        input_file = "sample_data/sample_data_2.txt"
    if input_file is None:
        return

    df = load_data(input_file)
    st.subheader('Input data')
    st.write(df)

    if st.checkbox('Create first name and last name columns'):
        df = add_first_and_last_name_columns(df)
    if st.checkbox('Add "Graded" column and remove "Grader" column'):
        df = add_graded_column(df)
    if st.checkbox('Create "Multiple" column and remove "Quantity" column'):
        df = add_multiple_column(df)

    is_sorted = False
    if st.checkbox("Change sort order"):
        st.subheader("Column sort order")
        st.write("Edit the sort priority by changing the numbers in the table below."
                 " Click the sort button below when you're done.")
        default_sort_order = get_default_sort_order(df)
        edited_sort_order = st.data_editor(default_sort_order)
        if st.button("Sort"):
            col_order = get_sort_order(edited_sort_order)
            if col_order:
                df = df.sort_values(by=col_order).reset_index(drop=True)
                is_sorted = True
            else:
                # Every priority was cleared: there is nothing to sort by.
                st.warning("Give at least one column a sort priority before sorting.",
                           icon="⚠️")

    # Exactly one heading for the result table.
    st.subheader('Sorted output data' if is_sorted else 'Output data')
    st.write(df)

    output_file = st.text_input("Enter name for the file to be downloaded", value="cards_output.tsv")
    # Offer the download only when a non-blank file name is entered.
    if output_file and output_file.strip():
        csv = convert_df(df)
        st.download_button("Download data as tab-separated values", csv, file_name=output_file)


if __name__ == "__main__":
    main()