Spaces:
Runtime error
Runtime error
Adding example texts to show
Browse files
app.py
CHANGED
@@ -9,11 +9,21 @@ import seaborn as sns
|
|
9 |
import numpy as np
|
10 |
import plotly.figure_factory as ff
|
11 |
import plotly.express as px
|
|
|
12 |
|
13 |
@st.cache_data
def load_data():
    """Read the validation CSV into a DataFrame; wrapped by ``st.cache_data``."""
    validation_frame = pd.read_csv('MassiveDatasetValidationData.csv')
    return validation_frame
|
16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
17 |
# TODO allow new tokenizers from HF
|
18 |
tokenizer_names_to_test = [
|
19 |
"openai/gpt4",
|
@@ -55,7 +65,7 @@ with st.sidebar:
|
|
55 |
languages = st.multiselect(
|
56 |
'Select languages',
|
57 |
options=sorted(val_data.lang.unique()),
|
58 |
-
default=['English', 'Spanish' ,'Chinese'],
|
59 |
max_selections=6,
|
60 |
label_visibility='collapsed'
|
61 |
)
|
@@ -82,7 +92,7 @@ with st.container():
|
|
82 |
subset_df = val_data[val_data.lang.isin(languages)]
|
83 |
subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
|
84 |
|
85 |
-
st.header('
|
86 |
fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=show_hist)
|
87 |
|
88 |
fig.update_layout(
|
@@ -100,6 +110,22 @@ with st.container():
|
|
100 |
metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))
|
101 |
|
102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
103 |
with st.expander("About the project"):
|
104 |
-
st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in
|
105 |
|
|
|
9 |
import numpy as np
|
10 |
import plotly.figure_factory as ff
|
11 |
import plotly.express as px
|
12 |
+
import random
|
13 |
|
14 |
@st.cache_data
def load_data():
    """Read the validation CSV into a DataFrame; wrapped by ``st.cache_data``."""
    validation_frame = pd.read_csv('MassiveDatasetValidationData.csv')
    return validation_frame
|
17 |
|
18 |
+
def reload_example_text_data():
    """Pick one random example id and publish its rows to session state.

    Reads the module-level ``val_data``, ``subset_df`` and ``tokenizer_name``.
    Stores the result in ``st.session_state.examplesdf``: the rows of
    ``subset_df`` whose ``id`` matches the drawn id, indexed by ``lang``, with
    columns ``ISO``, ``Text`` and ``Num Tokens``, sorted by ``ISO``.
    """
    # Draw from a plain list: random.choice() does ``seq[i]`` (a *label* lookup
    # on a Series, so it only works with a 0..n-1 index) and some Python 3.11
    # releases truth-test the argument, which a Series rejects.
    random_id = random.choice(val_data['id'].tolist())

    # Build a fresh frame rather than mutating the boolean-mask slice in place;
    # in-place set_index/sort_values on a slice raises SettingWithCopyWarning.
    examples = subset_df.loc[
        subset_df['id'] == random_id,
        ['lang', 'iso', 'text', tokenizer_name],
    ].set_index('lang')
    examples.columns = ['ISO', 'Text', 'Num Tokens']

    st.session_state.examplesdf = examples.sort_values(by='ISO')
|
26 |
+
|
27 |
# TODO allow new tokenizers from HF
|
28 |
tokenizer_names_to_test = [
|
29 |
"openai/gpt4",
|
|
|
65 |
languages = st.multiselect(
|
66 |
'Select languages',
|
67 |
options=sorted(val_data.lang.unique()),
|
68 |
+
default=['English', 'Spanish' ,'Chinese', 'Burmese'],
|
69 |
max_selections=6,
|
70 |
label_visibility='collapsed'
|
71 |
)
|
|
|
92 |
subset_df = val_data[val_data.lang.isin(languages)]
|
93 |
subset_data = [val_data[val_data.lang==_lang][tokenizer_name] for _lang in languages]
|
94 |
|
95 |
+
st.header('Compare tokenization in different languages')
|
96 |
fig = ff.create_distplot(subset_data, group_labels=languages, show_hist=show_hist)
|
97 |
|
98 |
fig.update_layout(
|
|
|
110 |
metric_cols[i].metric(_lang, int(np.median(subset_df[subset_df.lang==_lang][tokenizer_name])))
|
111 |
|
112 |
|
113 |
+
st.subheader('Example Texts')
|
114 |
+
|
115 |
+
reload_example_text_data()
|
116 |
+
if st.button("🔄 Refresh"):
|
117 |
+
reload_example_text_data()
|
118 |
+
|
119 |
+
st.dataframe(st.session_state.examplesdf) # Same as st.write(df)
|
120 |
+
|
121 |
+
|
122 |
+
|
123 |
+
|
124 |
+
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
with st.expander("About the project"):
|
130 |
+
st.write("The purpose of this project is to compare the tokenization length for different languages. For some tokenizers, tokenizing a message in one language may result in 10-20x more tokens than a comparable message in another language (e.g. try English vs. Burmese). This is part of a larger project of measuring inequality in NLP.")
|
131 |
|