Spaces:
Runtime error
Runtime error
Move the token results from sidebar
Browse files
app.py
CHANGED
@@ -93,59 +93,59 @@ with st.sidebar:
|
|
93 |
label_visibility="collapsed",
|
94 |
)
|
95 |
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
|
106 |
-
|
107 |
-
|
108 |
-
|
109 |
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
-
|
115 |
-
)
|
116 |
-
fig.update_layout(
|
117 |
-
title=f"Distribution of Number of Tokens for Selected Tokenizers",
|
118 |
-
xaxis_title="Tokenizer",
|
119 |
-
yaxis_title="Number of Tokens",
|
120 |
)
|
121 |
-
|
122 |
-
|
123 |
-
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
|
133 |
-
|
134 |
-
|
135 |
-
|
136 |
-
|
137 |
-
|
138 |
-
st.plotly_chart(fig)
|
139 |
-
elif selected_figure == "Scatterplot":
|
140 |
-
df = pd.DataFrame(tokenizer_to_num_tokens)
|
141 |
-
fig = px.scatter_matrix(
|
142 |
-
df,
|
143 |
-
dimensions=selected_tokenizers,
|
144 |
-
color_discrete_sequence=px.colors.qualitative.Plotly,
|
145 |
-
)
|
146 |
-
fig.update_layout(
|
147 |
-
title=f"Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
|
148 |
-
width=800,
|
149 |
-
height=800,
|
150 |
)
|
151 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
93 |
label_visibility="collapsed",
|
94 |
)
|
95 |
|
96 |
+
st.header("Example Text")
|
97 |
+
with st.spinner("Loading example text..."):
|
98 |
+
reload_example_text_data(selected_language)
|
99 |
+
st.table(st.session_state.examplesdf)
|
100 |
+
st.button("Reload", on_click=reload_example_text_data, args=(selected_language,))
|
101 |
|
102 |
+
tokenizer_to_num_tokens = defaultdict(list)
|
103 |
+
for _, row in tqdm.tqdm(val_data.iterrows(), total=len(val_data)):
|
104 |
+
text = row["text"]
|
105 |
+
for tokenizer_name in selected_tokenizers:
|
106 |
+
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
|
107 |
+
num_tokens = len(tokenizer(text)["input_ids"])
|
108 |
+
tokenizer_to_num_tokens[tokenizer_name].append(num_tokens)
|
109 |
|
110 |
+
if selected_figure == "Boxplot":
|
111 |
+
fig = go.Figure()
|
112 |
+
for tokenizer_name in selected_tokenizers:
|
113 |
+
fig.add_trace(
|
114 |
+
go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
|
|
|
|
|
|
|
|
|
|
|
115 |
)
|
116 |
+
fig.update_layout(
|
117 |
+
title=f"Distribution of Number of Tokens for Selected Tokenizers",
|
118 |
+
xaxis_title="Tokenizer",
|
119 |
+
yaxis_title="Number of Tokens",
|
120 |
+
)
|
121 |
+
st.plotly_chart(fig)
|
122 |
+
elif selected_figure == "Histogram":
|
123 |
+
fig = make_subplots(
|
124 |
+
rows=len(selected_tokenizers), cols=1, subplot_titles=selected_tokenizers
|
125 |
+
)
|
126 |
+
for i, tokenizer_name in enumerate(selected_tokenizers):
|
127 |
+
fig.add_trace(
|
128 |
+
go.Histogram(
|
129 |
+
x=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name
|
130 |
+
),
|
131 |
+
row=i + 1,
|
132 |
+
col=1,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
133 |
)
|
134 |
+
fig.update_layout(
|
135 |
+
height=200 * len(selected_tokenizers),
|
136 |
+
title_text="Histogram of Number of Tokens",
|
137 |
+
)
|
138 |
+
st.plotly_chart(fig)
|
139 |
+
elif selected_figure == "Scatterplot":
|
140 |
+
df = pd.DataFrame(tokenizer_to_num_tokens)
|
141 |
+
fig = px.scatter_matrix(
|
142 |
+
df,
|
143 |
+
dimensions=selected_tokenizers,
|
144 |
+
color_discrete_sequence=px.colors.qualitative.Plotly,
|
145 |
+
)
|
146 |
+
fig.update_layout(
|
147 |
+
title=f"Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
|
148 |
+
width=800,
|
149 |
+
height=800,
|
150 |
+
)
|
151 |
+
st.plotly_chart(fig)
|