"""Streamlit app for cleaning up a tab-separated card-collection spreadsheet.

The app loads a tab-separated file, optionally derives "First Name" /
"Last Name", "Graded" and "Multiple" columns, lets the user choose a column
sort order, and offers the result as a tab-separated download.
"""
import math

import pandas as pd
import streamlit as st

# Substrings that mark a "Player Name" entry as something other than a single
# player (sets, team cards, multi-player cards, checklists).
_NON_PLAYER_MARKERS = (
    '1992 Hoops set',
    '2016 Topps Now Highlights',
    'Fox-Aparicio',
    '1960 World Series',
    'Mantle/Berra',
    'Ashburn-Mays',
    'Ruth/Aaron/Mays',
    'Mays/Snider',
    'New York Yankees',
    'Checklist',
)

# Columns that get a sort priority by default, highest priority first.
_DEFAULT_SORT_COLUMNS = ("Type", "Graded", "Sport", "Last Name", "Year")


@st.cache_data
def load_data(file, skiprows=0, encoding="ISO-8859-1") -> pd.DataFrame:
    """Read a tab-separated file into a DataFrame.

    Args:
        file: Path or file-like object accepted by ``pandas.read_csv``.
        skiprows: Number of leading rows to skip before the header row.
        encoding: Text encoding used to decode the file.

    Returns:
        The parsed DataFrame.
    """
    return pd.read_csv(file, sep="\t", skiprows=skiprows, encoding=encoding)


@st.cache_data
def convert_df(df: pd.DataFrame) -> bytes:
    """Serialize *df* as UTF-8 encoded tab-separated text without the index."""
    return df.to_csv(sep="\t", index=False).encode('utf-8')


# Deliberately not wrapped in st.cache_data: this runs once per row, and the
# DataFrame-level caller (add_first_and_last_name_columns) is already cached.
def extract_surname(name) -> tuple:
    """Split a "Player Name" value into ``(first_name, last_name)``.

    Args:
        name: One cell of the "Player Name" column.

    Returns:
        ``(None, None)`` when *name* is not a string or contains no
        non-whitespace characters; ``('multiple', 'multiple')`` when the entry
        matches a known non-player marker or contains a comma; otherwise
        everything before the last whitespace-separated token as the first
        name and the last token as the last name. A few specific names are
        special-cased.

    Raises:
        ValueError: If the name contains "-" or "/" and was not matched by any
            earlier rule (treated as an unrecognized multi-player card).
    """
    if not isinstance(name, str):
        return None, None
    # A blank cell has no tokens; without this guard split()[-1] raises
    # IndexError further down.
    if not name.strip():
        return None, None

    if name == "G Hill Tribute":
        return "G", "Hill"
    if name in ('Ken Griffey, Jr.', 'Ken Griffey Jr.'):
        return "Ken", "Griffey Jr."
    if 'Vladimir Guerrero Jr.' in name:
        return "Vladimir", 'Guerrero Jr.'
    if any(marker in name for marker in _NON_PLAYER_MARKERS):
        return 'multiple', 'multiple'
    if "," in name:
        return "multiple", 'multiple'

    if "-" in name or "/" in name:
        print(name)  # surface the offending value in the server log
        raise ValueError("Found suspected multiple-name card!")

    tokens = name.split()
    return " ".join(tokens[:-1]), tokens[-1]


# Deliberately not wrapped in st.cache_data: this runs once per row, and the
# DataFrame-level caller (add_graded_column) is already cached.
def add_grading_status(grader) -> str:
    """Map one "Grader" cell to "Yes" (graded) or "No" (not graded).

    Args:
        grader: One cell of the "Grader" column.

    Returns:
        "Yes" if *grader* is a string, "No" if it is a missing value.

    Raises:
        ValueError: If *grader* is neither a string nor a missing value.
    """
    if isinstance(grader, str):
        return "Yes"
    # pd.isna covers NaN as well as None / pd.NA; math.isnan raises TypeError
    # on anything that is not a real number.
    if pd.isna(grader):
        return "No"
    print(grader)  # surface the offending value in the server log
    raise ValueError("Found unexpected item in Grader column!")


@st.cache_data
def get_default_sort_order(df: pd.DataFrame) -> pd.DataFrame:
    """Build the initial sort-priority table for the columns of *df*.

    Returns:
        A DataFrame with one row per column of *df* and the columns
        "column" and "order". Columns listed in the default sort order get
        consecutive priorities starting at 0; all other columns get no
        priority.
    """
    columns = list(df.columns.values)
    prioritized = [c for c in _DEFAULT_SORT_COLUMNS if c in columns]

    rows = [{"column": c, "order": i} for i, c in enumerate(prioritized)]
    rows.extend(
        {"column": c, "order": None} for c in columns if c not in prioritized
    )
    # Pin the column names so a frame without columns still yields a table
    # that has "column" and "order".
    return pd.DataFrame(rows, columns=["column", "order"])


@st.cache_data
def get_sort_order(edited_sort_order: pd.DataFrame) -> list:
    """Return the column names to sort by, highest priority first.

    Args:
        edited_sort_order: Table with "column" and "order" columns; rows whose
            "order" is missing are ignored.

    Returns:
        Column names ordered by ascending "order" value.
    """
    ranked = edited_sort_order.dropna(subset=["order"])
    pairs = zip(ranked["column"].tolist(), ranked["order"].tolist())
    return [column for column, _ in sorted(pairs, key=lambda pair: pair[1])]


@st.cache_data
def add_graded_column(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the "Grader" column with a Yes/No "Graded" column.

    If *df* has no "Grader" column a warning is shown and *df* is returned
    unchanged.
    """
    if "Grader" not in df:
        st.warning('Input data must have a "Grader" column'
                   ' in order to create a "Graded" column', icon="⚠️")
        return df

    graded = pd.Categorical(
        df['Grader'].apply(add_grading_status),
        categories=["Yes", "No"],  # sets sort order
    )
    # Build a new frame instead of mutating the argument, so the caller's
    # frame is in the same state whether or not this call was served from
    # the cache.
    return df.drop(columns=['Grader']).assign(Graded=graded)


@st.cache_data
def add_multiple_column(df: pd.DataFrame) -> pd.DataFrame:
    """Replace the "Quantity" column with a Yes/No "Multiple" column.

    A row is marked "Yes" when its quantity is greater than 1. If *df* has no
    "Quantity" column a warning is shown and *df* is returned unchanged.
    """
    if "Quantity" not in df:
        st.warning('Input data must have a "Quantity" column'
                   ' in order to do this', icon="⚠️")
        return df

    multiple = pd.Categorical(
        df['Quantity'].apply(lambda qty: "Yes" if qty > 1 else "No"),
        categories=["Yes", "No"],
    )
    # New frame rather than in-place edit; see add_graded_column.
    return df.drop(columns=['Quantity']).assign(Multiple=multiple)


@st.cache_data
def add_first_and_last_name_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Add "First Name" and "Last Name" columns derived from "Player Name".

    If *df* has no "Player Name" column a warning is shown and *df* is
    returned unchanged.
    """
    if "Player Name" not in df:
        st.warning('Input data must have a "Player Name" column'
                   ' in order to extract first and last names', icon="⚠️")
        return df

    # Parse each name once and reuse the result for both columns.
    parsed = [extract_surname(name) for name in df['Player Name']]
    return df.assign(**{
        'First Name': [first for first, _ in parsed],
        'Last Name': [last for _, last in parsed],
    })


def main() -> None:
    """Render the Streamlit page."""
    st.markdown("# Baseball card data wrangling")
    st.write("Upload a tab-separated spreadsheet. The first row should contain column"
             " names.")

    # Only text formats are offered: load_data parses with read_csv(sep="\t"),
    # which cannot read .xlsx workbooks.
    input_file = st.file_uploader("Choose a file", type=['txt', 'csv', 'tsv'])
    if st.checkbox("Use sample data 1 (baseball cards)"):
        input_file = "sample_data/sample_data_1.txt"
    elif st.checkbox("Use sample data 2 (big cats)"):
        input_file = "sample_data/sample_data_2.txt"

    if input_file is None:
        return

    df = load_data(input_file)
    st.subheader('Input data')
    st.write(df)

    if st.checkbox('Create first name and last name columns'):
        df = add_first_and_last_name_columns(df)
    if st.checkbox('Add "Graded" column and remove "Grader" column'):
        df = add_graded_column(df)
    if st.checkbox('Create "Multiple" column and remove "Quantity" column'):
        df = add_multiple_column(df)

    if st.checkbox("Change sort order"):
        st.subheader("Column sort order")
        st.write("Edit the sort priority by changing the numbers in the table below."
                 " Click the sort button below when you're done.")
        default_sort_order = get_default_sort_order(df)
        edited_sort_order = st.data_editor(default_sort_order)
        if st.button("Sort"):
            col_order = get_sort_order(edited_sort_order)
            st.subheader('Sorted output data')
            df = df.sort_values(by=col_order).reset_index(drop=True)

    st.subheader('Output data')
    st.write(df)

    output_file = st.text_input("Enter name for the file to be downloaded",
                                value="cards_output.tsv")
    # st.text_input yields "" (not None) when the box is cleared, so test
    # truthiness to avoid offering a download with an empty file name.
    if output_file:
        csv = convert_df(df)
        st.download_button("Download data as tab-separated values", csv,
                           file_name=output_file)


if __name__ == "__main__":
    main()