Spaces:
Sleeping
Sleeping
import math | |
import pandas as pd | |
import streamlit as st | |
def load_data(file, skiprows=0, encoding="ISO-8859-1"):
    """Parse a tab-separated table into a DataFrame.

    Args:
        file: Path or file-like object, passed straight to ``pandas.read_csv``.
        skiprows: Forwarded to ``read_csv`` as ``skiprows``.
        encoding: Text encoding forwarded to ``read_csv``.

    Returns:
        The ``pandas.DataFrame`` produced by ``read_csv``.
    """
    read_options = {"sep": "\t", "skiprows": skiprows, "encoding": encoding}
    return pd.read_csv(file, **read_options)
def convert_df(df):
    """Serialise ``df`` as tab-separated text (no index column) in UTF-8 bytes."""
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")
def extract_surname(name):
    """Split a card's player name into a ``(first name, last name)`` pair.

    Args:
        name: Value taken from the player-name column. May be a non-string
            (for example NaN for an empty cell).

    Returns:
        A 2-tuple ``(first, last)``:

        * ``(None, None)`` when ``name`` is not a string, or is empty or
          whitespace-only;
        * a hard-coded pair for the special cases handled below;
        * ``("multiple", "multiple")`` when the name contains one of the
          known non-player / multi-player labels in ``non_names`` or a comma;
        * otherwise, everything before the last whitespace-separated word as
          the first name and that last word as the last name.

    Raises:
        ValueError: If an otherwise unrecognised name contains "-" or "/",
            which is treated as a multi-player card that is not yet listed
            in ``non_names``. The offending name is printed first.
    """
    non_names = (
        '1992 Hoops set', '2016 Topps Now Highlights', 'Fox-Aparicio',
        '1960 World Series', 'Mantle/Berra', 'Ashburn-Mays', 'Ruth/Aaron/Mays',
        'Mays/Snider', 'New York Yankees', 'Checklist',
    )
    if not isinstance(name, str):
        return None, None

    words = name.split()
    if not words:
        # Blank cell: there is no last word to index, so report it as
        # missing instead of failing with IndexError.
        return None, None

    if name == "G Hill Tribute":
        return "G", "Hill"
    # Exact match only, so that the comma in "Griffey, Jr." is not mistaken
    # for a multi-player separator by the comma rule below.
    if name in ('Ken Griffey, Jr.', 'Ken Griffey Jr.'):
        return "Ken", "Griffey Jr."
    if 'Vladimir Guerrero Jr.' in name:
        return "Vladimir", 'Guerrero Jr.'
    if any(item in name for item in non_names) or "," in name:
        return 'multiple', 'multiple'

    if "-" in name or "/" in name:
        print(name)
        raise ValueError("Found suspected multiple-name card!")
    return " ".join(words[:-1]), words[-1]
def add_grading_status(grader):
    """Map one "Grader" cell to a graded flag.

    Args:
        grader: A single value from the grader column.

    Returns:
        ``"Yes"`` when ``grader`` is a string, ``"No"`` when it is missing
        (``None`` or NaN).

    Raises:
        Exception: If ``grader`` is neither a string nor a missing value;
            the offending value is printed first.
    """
    if isinstance(grader, str):
        return "Yes"
    if grader is None:
        # math.isnan() rejects None with TypeError; treat it as missing.
        return "No"
    try:
        is_missing = math.isnan(grader)
    except TypeError:
        # Not a real number at all (list, dict, ...): fall through to the
        # explicit "unexpected item" error rather than leaking TypeError.
        is_missing = False
    if is_missing:
        return "No"
    print(grader)
    raise Exception("Found unexpected item in Grader column!")
def get_default_sort_order(df):
    """Build the initial sort-priority table for the columns of ``df``.

    The returned DataFrame has one row per column of ``df`` with two fields:
    ``"column"`` (the column name) and ``"order"`` (its sort priority).
    Columns named "Type", "Graded", "Sport", "Last Name" and "Year" that are
    present in ``df`` come first, numbered from 0 in that fixed sequence;
    every other column follows in ``df`` order with an ``order`` of ``None``.
    """
    preferred = ("Type", "Graded", "Sport", "Last Name", "Year")
    present = df.columns.values
    ranked = [name for name in preferred if name in present]

    rows = [{"column": name, "order": rank} for rank, name in enumerate(ranked)]
    rows.extend(
        {"column": name, "order": None}
        for name in present
        if name not in ranked
    )
    return pd.DataFrame(rows)
def get_sort_order(edited_sort_order):
    """Return the column names that have a sort priority, lowest value first.

    Rows of ``edited_sort_order`` whose ``"order"`` is missing are ignored.
    Rows sharing the same priority keep their relative table order.
    """
    ranked = edited_sort_order.dropna(subset="order")
    pairs = list(zip(ranked["column"].tolist(), ranked["order"].tolist()))
    pairs.sort(key=lambda pair: pair[1])
    return [name for name, _rank in pairs]
def add_graded_column(df):
    """Replace the "Grader" column with a Yes/No "Graded" column.

    When ``df`` has no "Grader" column, a Streamlit warning is shown and
    ``df`` is returned as it was. Otherwise each grader value is classified
    with ``add_grading_status`` and stored as a categorical "Graded" column,
    and a copy of the frame without "Grader" is returned. The "Graded" column
    is also written onto the frame that was passed in.
    """
    if "Grader" not in df:
        st.warning('Input data must have a "Grader" column'
                   ' in order to create a "Graded" column', icon="⚠️")
        return df

    statuses = df["Grader"].apply(add_grading_status)
    # Listing "Yes" before "No" fixes the category order, which sets the
    # order used when sorting on this column.
    df["Graded"] = pd.Categorical(statuses, categories=["Yes", "No"])
    return df.drop(columns=["Grader"])
def add_multiple_column(df):
    """Replace the "Quantity" column with a Yes/No "Multiple" column.

    "Multiple" is "Yes" where the quantity is greater than 1 and "No"
    otherwise, stored as a categorical with "Yes" ordered before "No".
    When ``df`` has no "Quantity" column, a Streamlit warning is shown and
    ``df`` is returned as it was. The "Multiple" column is also written onto
    the frame that was passed in; the returned copy has "Quantity" removed.
    """
    if "Quantity" not in df:
        st.warning('Input data must have a "Quantity" column'
                   ' in order to do this', icon="⚠️")
        return df

    def flag(quantity):
        if quantity > 1:
            return "Yes"
        return "No"

    flags = df["Quantity"].apply(flag)
    df["Multiple"] = pd.Categorical(flags, categories=["Yes", "No"])
    return df.drop(columns=["Quantity"])
def add_first_and_last_name_columns(df):
    """Add "First Name" and "Last Name" columns derived from "Player Name".

    Each player name is split with ``extract_surname``. The two new columns
    are written onto ``df`` itself, which is then returned. When ``df`` has
    no "Player Name" column, a Streamlit warning is shown and ``df`` is
    returned unchanged.
    """
    if "Player Name" not in df:
        st.warning('Input data must have a "Player Name" column'
                   ' in order to extract first and last names', icon="⚠️")
        return df

    # Parse every name once and reuse the (first, last) pair for both
    # columns, rather than running extract_surname twice per row.
    parsed = df["Player Name"].apply(extract_surname)
    df["First Name"] = parsed.apply(lambda pair: pair[0])
    df["Last Name"] = parsed.apply(lambda pair: pair[-1])
    return df
if __name__ == "__main__":
    # Streamlit page: load a tab-separated table, optionally derive extra
    # columns, optionally sort, then offer the result for download.
    st.markdown("# Baseball card data wrangling")
    st.write("Upload a tab-separated spreadsheet. The first row should contain column"
             " names.")

    # Only delimited-text extensions are offered: load_data() parses with
    # pandas.read_csv, which cannot read Excel workbooks.
    input_file = st.file_uploader("Choose a file", type=['txt', 'csv', 'tsv'])

    # A ticked sample-data box takes precedence over any uploaded file.
    if st.checkbox("Use sample data 1 (baseball cards)"):
        input_file = "sample_data/sample_data_1.txt"
    elif st.checkbox("Use sample data 2 (big cats)"):
        input_file = "sample_data/sample_data_2.txt"

    if input_file is not None:
        df = load_data(input_file)
        st.subheader('Input data')
        st.write(df)

        # Optional column transformations, applied in this fixed order.
        if st.checkbox('Create first name and last name columns'):
            df = add_first_and_last_name_columns(df)
        if st.checkbox('Add "Graded" column and remove "Grader" column'):
            df = add_graded_column(df)
        if st.checkbox('Create "Multiple" column and remove "Quantity" column'):
            df = add_multiple_column(df)

        if st.checkbox("Change sort order"):
            st.subheader("Column sort order")
            st.write("Edit the sort priority by changing the numbers in the table below."
                     " Click the sort button below when you're done.")
            default_sort_order = get_default_sort_order(df)
            edited_sort_order = st.data_editor(default_sort_order)
            do_sort = st.button("Sort")
            if do_sort:
                requested_order = get_sort_order(edited_sort_order)
                # NOTE(review): the editor presumably lets the user change the
                # "column" cells as well as the numbers, so drop any name that
                # is not an actual column before sorting.
                col_order = [col for col in requested_order if col in df.columns]
                st.subheader('Sorted output data')
                # With every priority cleared there is nothing to sort by;
                # skip the call instead of passing an empty key list.
                if col_order:
                    df = df.sort_values(by=col_order).reset_index(drop=True)

        st.subheader('Output data')
        st.write(df)

        default_output_name = "cards_output.tsv"
        output_file = st.text_input("Enter name for the file to be downloaded",
                                    value=default_output_name)
        # A cleared text box yields an empty string rather than None, so fall
        # back to the default name instead of offering a nameless download.
        output_file = (output_file or "").strip() or default_output_name
        csv = convert_df(df)
        st.download_button("Download data as tab-separated values", csv,
                           file_name=output_file)