Spaces:
Sleeping
Sleeping
File size: 5,629 Bytes
ff2a8fc e486b9e ff2a8fc e486b9e ff2a8fc e486b9e ff2a8fc 85379ec ff2a8fc 85379ec ff2a8fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
import math
import pandas as pd
import streamlit as st
@st.cache_data
def load_data(file, skiprows=0, encoding="ISO-8859-1"):
    """Read a tab-separated file into a DataFrame.

    Args:
        file: Path or file-like object passed straight to ``pandas.read_csv``.
        skiprows: Forwarded to ``pandas.read_csv`` (rows to skip at the top).
        encoding: Text encoding used to decode the file.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    read_options = {"sep": "\t", "skiprows": skiprows, "encoding": encoding}
    return pd.read_csv(file, **read_options)
@st.cache_data
def convert_df(df):
    """Serialize ``df`` as UTF-8 encoded tab-separated text, without the index.

    Args:
        df: The DataFrame to export.

    Returns:
        ``bytes`` holding the tab-separated representation of ``df``.
    """
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")
@st.cache_data
def extract_surname(name):
    """Split a card's player name into a ``(first_name, last_name)`` pair.

    Args:
        name: One cell of the player-name column. Anything that is not a
            string (for example a NaN from an empty cell) counts as missing.

    Returns:
        A 2-tuple ``(first_name, last_name)``:

        * ``(None, None)`` when ``name`` is not a string or is blank.
        * Hard-coded pairs for "G Hill Tribute", Ken Griffey Jr. and
          Vladimir Guerrero Jr.
        * ``("multiple", "multiple")`` for the known non-player / multi-player
          cards listed below and for any name containing a comma.
        * Otherwise, everything before the last whitespace-separated word and
          that last word (a single-word name yields ``("", word)``).

    Raises:
        ValueError: If the name contains "-" or "/" and is not one of the
            known special cases, since it probably names several players.
    """
    non_names = (
        '1992 Hoops set', '2016 Topps Now Highlights', 'Fox-Aparicio',
        '1960 World Series', 'Mantle/Berra', 'Ashburn-Mays', 'Ruth/Aaron/Mays',
        'Mays/Snider', 'New York Yankees', 'Checklist',
    )

    if not isinstance(name, str):
        return None, None
    if name == "G Hill Tribute":
        return "G", "Hill"
    if name in ('Ken Griffey, Jr.', 'Ken Griffey Jr.'):
        return "Ken", "Griffey Jr."
    if 'Vladimir Guerrero Jr.' in name:
        return "Vladimir", 'Guerrero Jr.'
    if any(item in name for item in non_names):
        return 'multiple', 'multiple'
    if "," in name:
        return "multiple", 'multiple'

    if "-" in name or "/" in name:
        # Put the offending value in the message so it is visible wherever
        # the exception is reported, not only on the server's stdout.
        raise ValueError(f"Found suspected multiple-name card! ({name!r})")

    parts = name.split()
    if not parts:
        # Blank or whitespace-only cell: treat it like a missing name rather
        # than failing with IndexError on parts[-1].
        return None, None
    return " ".join(parts[:-1]), parts[-1]
@st.cache_data
def add_grading_status(grader):
    """Classify one "Grader" cell as graded ("Yes") or ungraded ("No").

    Args:
        grader: One cell of the "Grader" column. Any string means the card is
            graded; ``None`` or a NaN means the cell is empty.

    Returns:
        ``"Yes"`` for any string, ``"No"`` for ``None`` or NaN.

    Raises:
        ValueError: For any other value (for example a real number), because
            the column is expected to hold only strings or missing values.
    """
    if isinstance(grader, str):
        return "Yes"
    if grader is None:
        return "No"
    try:
        is_missing = math.isnan(grader)
    except TypeError:
        # math.isnan rejects non-numeric objects; those are unexpected values
        # and should reach the descriptive error below, not a bare TypeError.
        is_missing = False
    if is_missing:
        return "No"
    raise ValueError(f"Found unexpected item in Grader column! ({grader!r})")
@st.cache_data
def get_default_sort_order(df):
    """Build the initial sort-priority table for the columns of ``df``.

    The columns "Type", "Graded", "Sport", "Last Name" and "Year" (those that
    exist in ``df``) receive priorities 0, 1, 2, ... in that order; every
    other column is listed afterwards with no priority.

    Args:
        df: DataFrame whose column names are to be ranked.

    Returns:
        A DataFrame built from one ``{"column", "order"}`` record per column
        of ``df``, with ``order`` set to ``None`` for unranked columns.
    """
    column_names = df.columns.values
    preferred = ["Type", "Graded", "Sport", "Last Name", "Year"]
    ranked = [name for name in preferred if name in column_names]

    records = []
    for rank, name in enumerate(ranked):
        records.append({"column": name, "order": rank})
    # Remaining columns keep their original relative order and get no rank.
    records.extend(
        {"column": name, "order": None}
        for name in column_names
        if name not in ranked
    )
    return pd.DataFrame(records)
@st.cache_data
def get_sort_order(edited_sort_order):
    """Return column names ordered by their assigned sort priority.

    Args:
        edited_sort_order: DataFrame with a "column" column (names) and an
            "order" column (priorities). Rows whose "order" is missing are
            ignored.

    Returns:
        A list of the remaining column names, lowest "order" value first.
    """
    ranked_rows = edited_sort_order.dropna(subset="order")
    names = ranked_rows["column"].tolist()
    priorities = ranked_rows["order"].tolist()
    by_priority = sorted(zip(names, priorities), key=lambda pair: pair[1])
    return [name for name, _priority in by_priority]
@st.cache_data
def add_graded_column(df):
    """Replace the "Grader" column with a Yes/No "Graded" column.

    The input DataFrame is left untouched; a new DataFrame is returned.

    Args:
        df: DataFrame expected to contain a "Grader" column.

    Returns:
        A new DataFrame with a categorical "Graded" column (categories
        "Yes", "No") and without "Grader". If "Grader" is absent, a warning
        is shown and ``df`` itself is returned unchanged.
    """
    if "Grader" not in df:
        st.warning('Input data must have a "Grader" column'
                   ' in order to create a "Graded" column', icon="⚠️")
        return df

    status = df["Grader"].apply(add_grading_status)
    # The category order ("Yes" before "No") is what sorting on this column uses.
    graded = pd.Categorical(status, categories=["Yes", "No"])
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    return df.assign(Graded=graded).drop(columns=["Grader"])
@st.cache_data
def add_multiple_column(df):
    """Replace the "Quantity" column with a Yes/No "Multiple" column.

    "Multiple" is "Yes" where the quantity is greater than 1 and "No"
    otherwise. The input DataFrame is left untouched; a new one is returned.

    Args:
        df: DataFrame expected to contain a "Quantity" column.

    Returns:
        A new DataFrame with a categorical "Multiple" column (categories
        "Yes", "No") and without "Quantity". If "Quantity" is absent, a
        warning is shown and ``df`` itself is returned unchanged.
    """
    if "Quantity" not in df:
        st.warning('Input data must have a "Quantity" column'
                   ' in order to do this', icon="⚠️")
        return df

    flags = df["Quantity"].apply(lambda qty: "Yes" if qty > 1 else "No")
    multiple = pd.Categorical(flags, categories=["Yes", "No"])
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    return df.assign(Multiple=multiple).drop(columns=["Quantity"])
@st.cache_data
def add_first_and_last_name_columns(df):
    """Add "First Name" and "Last Name" columns derived from "Player Name".

    Each player name is parsed once with ``extract_surname``. The input
    DataFrame is left untouched; a new one is returned.

    Args:
        df: DataFrame expected to contain a "Player Name" column.

    Returns:
        A new DataFrame with "First Name" and "Last Name" columns added. If
        "Player Name" is absent, a warning is shown and ``df`` itself is
        returned unchanged.
    """
    if "Player Name" not in df:
        st.warning('Input data must have a "Player Name" column'
                   ' in order to extract first and last names', icon="⚠️")
        return df

    # Parse every name once and reuse the pair for both columns.
    pairs = [extract_surname(name) for name in df["Player Name"]]
    first_names = pd.Series([pair[0] for pair in pairs], index=df.index, dtype=object)
    last_names = pd.Series([pair[-1] for pair in pairs], index=df.index, dtype=object)
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    return df.assign(**{"First Name": first_names, "Last Name": last_names})
if __name__ == "__main__":
    # Page layout: choose the input, optionally derive columns, optionally
    # sort, then show the result and offer it as a download.
    st.markdown("# Baseball card data wrangling")
    st.write("Upload a tab-separated spreadsheet. The first row should contain column"
             " names.")

    # NOTE(review): 'xlsx' is accepted here, but load_data always parses the
    # input as tab-separated text. Confirm whether Excel uploads should be allowed.
    input_file = st.file_uploader("Choose a file", type=['txt', 'csv', 'tsv', 'xlsx'])
    # A ticked sample-data checkbox takes precedence over an uploaded file.
    if st.checkbox("Use sample data 1 (baseball cards)"):
        input_file = "sample_data/sample_data_1.txt"
    elif st.checkbox("Use sample data 2 (big cats)"):
        input_file = "sample_data/sample_data_2.txt"

    if input_file is not None:
        df = load_data(input_file)
        st.subheader('Input data')
        st.write(df)

        # Optional column transformations, applied in this fixed order.
        if st.checkbox('Create first name and last name columns'):
            df = add_first_and_last_name_columns(df)
        if st.checkbox('Add "Graded" column and remove "Grader" column'):
            df = add_graded_column(df)
        if st.checkbox('Create "Multiple" column and remove "Quantity" column'):
            df = add_multiple_column(df)

        if st.checkbox("Change sort order"):
            st.subheader("Column sort order")
            st.write("Edit the sort priority by changing the numbers in the table below."
                     " Click the sort button below when you're done.")
            default_sort_order = get_default_sort_order(df)
            edited_sort_order = st.data_editor(default_sort_order)
            do_sort = st.button("Sort")
            if do_sort:
                col_order = get_sort_order(edited_sort_order)
                st.subheader('Sorted output data')
                # If every priority was cleared there is nothing to sort by,
                # so skip sort_values rather than call it with an empty list.
                if col_order:
                    df = df.sort_values(by=col_order).reset_index(drop=True)

        st.subheader('Output data')
        st.write(df)

        output_file = st.text_input("Enter name for the file to be downloaded",
                                    value="cards_output.tsv")
        # text_input yields an empty string (not None) when the box is cleared,
        # so test for a non-blank name before offering the download.
        if output_file and output_file.strip():
            tsv_bytes = convert_df(df)
            st.download_button("Download data as tab-separated values", tsv_bytes,
                               file_name=output_file)
|