File size: 5,629 Bytes
ff2a8fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e486b9e
ff2a8fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e486b9e
ff2a8fc
 
e486b9e
ff2a8fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85379ec
ff2a8fc
 
85379ec
ff2a8fc
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
import math

import pandas as pd
import streamlit as st


@st.cache_data
def load_data(file, skiprows=0, encoding="ISO-8859-1"):
    """Read a tab-separated file into a DataFrame.

    Args:
        file: Path or file-like object, handed straight to ``pd.read_csv``.
        skiprows: Forwarded to ``pd.read_csv`` as ``skiprows``.
        encoding: Forwarded to ``pd.read_csv`` as ``encoding``.

    Returns:
        The ``pandas.DataFrame`` produced by ``pd.read_csv``.
    """
    read_options = {"sep": "\t", "skiprows": skiprows, "encoding": encoding}
    return pd.read_csv(file, **read_options)


@st.cache_data
def convert_df(df):
    """Serialise ``df`` as tab-separated text (no index column), UTF-8 encoded.

    Returns:
        The ``bytes`` payload handed to the download button.
    """
    tsv_text = df.to_csv(sep="\t", index=False)
    return tsv_text.encode("utf-8")


@st.cache_data    
def extract_surname(name):
    """Split a player-name cell into a ``(first_name, last_name)`` pair.

    Args:
        name: A single value from the player-name column. Anything that is
            not a string (for example the NaN pandas uses for empty cells)
            is treated as "no name".

    Returns:
        A 2-tuple:

        * ``(None, None)`` when ``name`` is not a string, or is empty or
          whitespace-only.
        * A hard-coded pair for the special cases listed below.
        * ``("multiple", "multiple")`` when the text contains a comma or
          one of the known non-player labels.
        * Otherwise everything before the last whitespace-separated word
          as the first name, and that last word as the last name.

    Raises:
        ValueError: If an otherwise unrecognised name contains ``-`` or
            ``/``, which suggests a multi-player card that is missing from
            the known list. The offending name is included in the message.
    """
    # Labels that identify sets, teams or multi-player cards, not one player.
    non_names = (
        '1992 Hoops set', '2016 Topps Now Highlights', 'Fox-Aparicio',
        '1960 World Series', 'Mantle/Berra', 'Ashburn-Mays', 'Ruth/Aaron/Mays',
        'Mays/Snider', 'New York Yankees', 'Checklist',
    )

    # Blank strings have no last word to split off, so treat them like
    # missing values instead of failing with IndexError further down.
    if not isinstance(name, str) or not name.strip():
        return None, None

    # Exact-match special cases come first; the comma in "Ken Griffey, Jr."
    # would otherwise be mistaken for a multi-player card.
    if name == "G Hill Tribute":
        return "G", "Hill"
    if name in ('Ken Griffey, Jr.', 'Ken Griffey Jr.'):
        return "Ken", "Griffey Jr."
    if 'Vladimir Guerrero Jr.' in name:
        return "Vladimir", 'Guerrero Jr.'

    if "," in name or any(label in name for label in non_names):
        return 'multiple', 'multiple'

    if "-" in name or "/" in name:
        # Carry the name in the exception so it is visible wherever the
        # error is reported, not only on the server's stdout.
        raise ValueError(f"Found suspected multiple-name card! ({name!r})")

    *given_names, surname = name.split()
    return " ".join(given_names), surname


@st.cache_data     
def add_grading_status(grader):
    """Map one "Grader" cell to a graded flag.

    Args:
        grader: A single value from the "Grader" column.

    Returns:
        ``"Yes"`` when ``grader`` is a string, ``"No"`` when it is missing
        (``None`` or a NaN).

    Raises:
        ValueError: If ``grader`` is neither a string nor a missing value
            (for example a plain number). The offending value is included
            in the message.
    """
    if isinstance(grader, str):
        return "Yes"
    if grader is None:
        return "No"

    # math.isnan rejects non-numeric objects with TypeError; those are
    # "unexpected items", not missing values, so fall through to the raise.
    try:
        is_missing = math.isnan(grader)
    except TypeError:
        is_missing = False
    if is_missing:
        return "No"

    raise ValueError(f"Found unexpected item in Grader column! ({grader!r})")


@st.cache_data
def get_default_sort_order(df):
    """Build the initial sort-priority table for the columns of ``df``.

    The preferred columns ("Type", "Graded", "Sport", "Last Name", "Year")
    that exist in ``df`` come first and are numbered 0, 1, 2, ... in that
    preference order. Every other column follows in its original position
    order, with no priority.

    Args:
        df: The DataFrame whose column names are listed.

    Returns:
        A DataFrame with one row per column of ``df`` and two columns:
        ``column`` (the name) and ``order`` (the priority, or a missing
        value for columns that are not sorted on). Both columns are always
        present, even when ``df`` has no columns at all.
    """
    preferred = ("Type", "Graded", "Sport", "Last Name", "Year")
    available = list(df.columns)

    ranked = [name for name in preferred if name in available]
    rows = [{"column": name, "order": rank} for rank, name in enumerate(ranked)]
    rows.extend(
        {"column": name, "order": None}
        for name in available
        if name not in ranked
    )

    # Naming the columns explicitly keeps the schema intact for an empty
    # input, so downstream code can still select "column" and "order".
    return pd.DataFrame(rows, columns=["column", "order"])


@st.cache_data
def get_sort_order(edited_sort_order):
    """Return the column names to sort by, highest priority first.

    Args:
        edited_sort_order: DataFrame with a ``column`` and an ``order``
            column. Rows whose ``order`` is missing are ignored.

    Returns:
        A list of the remaining ``column`` values, arranged by ascending
        ``order``. Rows with equal ``order`` keep their relative position.
    """
    prioritised = edited_sort_order.dropna(subset="order")
    ranked_pairs = list(
        zip(prioritised["order"].tolist(), prioritised["column"].tolist())
    )
    # list.sort is stable, so ties stay in table order.
    ranked_pairs.sort(key=lambda pair: pair[0])
    return [column for _, column in ranked_pairs]


@st.cache_data
def add_graded_column(df):
    """Replace the "Grader" column with a categorical "Graded" (Yes/No) column.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame without "Grader" and with a "Graded" column whose
        categories are ordered ``["Yes", "No"]``. If ``df`` has no "Grader"
        column, a warning is shown and ``df`` is returned unchanged.

    Raises:
        Exception: Propagated from ``add_grading_status`` when a "Grader"
            cell holds a value it does not accept.
    """
    if "Grader" not in df:
        st.warning('Input data must have a "Grader" column'
                   ' in order to create a "Graded" column', icon="⚠️")
        return df

    status = df["Grader"].apply(add_grading_status)
    # The category order ("Yes" before "No") is what sorting on this column uses.
    graded = pd.Categorical(status, categories=["Yes", "No"])
    # Build a new frame rather than writing "Graded" into the caller's df.
    return df.drop(columns=["Grader"]).assign(Graded=graded)


@st.cache_data
def add_multiple_column(df):
    """Replace the "Quantity" column with a categorical "Multiple" (Yes/No) column.

    A row is marked ``"Yes"`` when its quantity is greater than 1 and
    ``"No"`` otherwise.

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame without "Quantity" and with a "Multiple" column
        whose categories are ordered ``["Yes", "No"]``. If ``df`` has no
        "Quantity" column, a warning is shown and ``df`` is returned
        unchanged.
    """
    if "Quantity" not in df:
        st.warning('Input data must have a "Quantity" column'
                   ' in order to do this', icon="⚠️")
        return df

    flags = df["Quantity"].apply(lambda qty: "Yes" if qty > 1 else "No")
    multiple = pd.Categorical(flags, categories=["Yes", "No"])
    # Build a new frame rather than writing "Multiple" into the caller's df.
    return df.drop(columns=["Quantity"]).assign(Multiple=multiple)


@st.cache_data
def add_first_and_last_name_columns(df):
    """Add "First Name" and "Last Name" columns derived from "Player Name".

    Args:
        df: Input DataFrame. It is not modified.

    Returns:
        A new DataFrame with "First Name" and "Last Name" filled from
        ``extract_surname`` applied to each "Player Name" value. If ``df``
        has no "Player Name" column, a warning is shown and ``df`` is
        returned unchanged.

    Raises:
        Exception: Propagated from ``extract_surname`` when it rejects a name.
    """
    if "Player Name" not in df:
        st.warning('Input data must have a "Player Name" column'
                   ' in order to extract first and last names', icon="⚠️")
        return df

    # Parse each name once and reuse the (first, last) pair for both columns.
    parsed = [extract_surname(player) for player in df["Player Name"]]
    return df.assign(**{
        "First Name": [first for first, _ in parsed],
        "Last Name": [last for _, last in parsed],
    })


if __name__ == "__main__":
    # Page flow: pick an input, optionally derive columns, optionally sort,
    # then preview the result and offer it as a download.
    st.markdown("# Baseball card data wrangling")
    st.write("Upload a tab-separated spreadsheet. The first row should contain column"
             " names.")
    # Only text formats are offered: load_data parses with pd.read_csv,
    # which cannot read .xlsx workbooks.
    input_file = st.file_uploader("Choose a file", type=['txt', 'csv', 'tsv'])

    # A ticked sample-data box overrides any uploaded file. The second box is
    # only rendered while the first one is unticked.
    if st.checkbox("Use sample data 1 (baseball cards)"):
        input_file = "sample_data/sample_data_1.txt"
    elif st.checkbox("Use sample data 2 (big cats)"):
        input_file = "sample_data/sample_data_2.txt"

    if input_file is not None:
        df = load_data(input_file)
        st.subheader('Input data')
        st.write(df)

        if st.checkbox('Create first name and last name columns'):
            df = add_first_and_last_name_columns(df)

        if st.checkbox('Add "Graded" column and remove "Grader" column'):
            df = add_graded_column(df)

        if st.checkbox('Create "Multiple" column and remove "Quantity" column'):
            df = add_multiple_column(df)

        if st.checkbox("Change sort order"):
            st.subheader("Column sort order")
            st.write("Edit the sort priority by changing the numbers in the table below."
                     " Click the sort button below when you're done.")
            default_sort_order = get_default_sort_order(df)
            edited_sort_order = st.data_editor(default_sort_order)

            if st.button("Sort"):
                col_order = get_sort_order(edited_sort_order)
                st.subheader('Sorted output data')
                # With every priority cleared there is nothing to sort on,
                # so leave the data as it is.
                if col_order:
                    df = df.sort_values(by=col_order).reset_index(drop=True)

        st.subheader('Output data')
        st.write(df)

        output_file = st.text_input("Enter name for the file to be downloaded", value="cards_output.tsv")
        # The text box yields an empty string when cleared; only offer the
        # download once there is a file name to use.
        if output_file:
            csv = convert_df(df)
            st.download_button("Download data as tab-separated values", csv, file_name=output_file)