Commit: Format the output
app.py CHANGED
@@ -1,9 +1,7 @@
 import streamlit as st
 import pandas as pd
-import random
 import plotly.graph_objects as go
-
-import plotly.express as px
+import numpy as np


 @st.cache_data
@@ -12,18 +10,13 @@ def load_data():


 def reload_example_text_data(selected_language, selected_tokenizers):
-
-
-
-
-
-
-
-    tempdf.columns = ["ISO", "Text"] + [
-        f"Num Tokens ({tokenizer})" for tokenizer in selected_tokenizers
-    ]
-    tempdf.sort_values(by="ISO", inplace=True)
-    st.session_state.examplesdf = tempdf
+    tempdf = val_data[val_data["lang"] == selected_language]
+    random_sample = tempdf.sample(n=1)
+    selected_text = random_sample["text"].iloc[0]
+    random_sample = random_sample[selected_tokenizers]
+    random_sample.columns = [f"{tokenizer}" for tokenizer in selected_tokenizers]
+    st.session_state.examplesdf = random_sample
+    return selected_text


 val_data = load_data()
@@ -53,6 +46,17 @@ with st.sidebar:
         default=["openai/gpt4", "Xenova/gpt-4o"],
         label_visibility="collapsed",
     )
+    links = [
+        (
+            f"[{tokenizer_name}](https://huggingface.co/{tokenizer_name})"
+            if tokenizer_name != "openai/gpt4"
+            else f"[{tokenizer_name}](https://github.com/openai/tiktoken)"
+        )
+        for tokenizer_name in selected_tokenizers
+    ]
+    link = "Tokenized using " + ", ".join(links)
+    st.markdown(link, unsafe_allow_html=True)
+
     language_options = sorted(val_data["lang"].unique())
     selected_language = st.selectbox(
         "Select language",
@@ -60,54 +64,43 @@ with st.sidebar:
         index=language_options.index("English") if "English" in language_options else 0,
         label_visibility="collapsed",
     )
-    selected_figure = st.selectbox(
-        "Select Plot Type",
-        options=["Boxplot", "Histogram", "Scatterplot"],
-        label_visibility="collapsed",
-    )

-
-
+selected_text = reload_example_text_data(selected_language, selected_tokenizers)
+st.subheader(f"**Sampled Text:** `{selected_text}`")
+st.subheader("Number of Tokens")
 st.table(st.session_state.examplesdf)
-st.button(
-    "Reload",
-    on_click=reload_example_text_data,
-    args=(selected_language, selected_tokenizers),
-)

-
-
-
+# Calculate metrics for each tokenizer
+tokenizer_metrics = {}
+for tokenizer in selected_tokenizers:
+    tokens = val_data[tokenizer].dropna()
+    median = np.median(tokens)
+    min_tokens = np.min(tokens)
+    max_tokens = np.max(tokens)
+    std_dev = np.std(tokens)
+    tokenizer_metrics[tokenizer] = {
+        "Median": median,
+        "Min": min_tokens,
+        "Max": max_tokens,
+        "Range": max_tokens - min_tokens,
+        "Standard Deviation": std_dev,
+    }

-
-
-
-
-
-
-
-
-
-
-
-    )
-
-
-
-
-
-
-        col=1,
-    )
-    fig.update_layout(
-        height=200 * len(selected_tokenizers),
-        title_text="Histogram of Number of Tokens",
-    )
-    st.plotly_chart(fig)
-elif selected_figure == "Scatterplot":
-    df = pd.DataFrame(tokenizer_to_num_tokens)
-    fig = px.scatter_matrix(df, dimensions=selected_tokenizers)
-    fig.update_layout(
-        title="Scatterplot Matrix of Number of Tokens for Selected Tokenizers"
-    )
-    st.plotly_chart(fig)
+# Display metrics
+st.subheader("Tokenizer Metrics")
+st.json(tokenizer_metrics)
+
+# Plot for top tokenizers by median token length
+sorted_tokenizers = sorted(tokenizer_metrics.items(), key=lambda x: x[1]["Median"])
+shortest_median = sorted_tokenizers[:5]
+longest_median = sorted_tokenizers[-5:]
+
+fig = go.Figure()
+for name, metrics in shortest_median + longest_median:
+    fig.add_trace(go.Bar(x=[name], y=[metrics["Median"]], name=name))
+fig.update_layout(
+    title="Top Tokenizers by Shortest and Longest Median Token Length",
+    xaxis_title="Tokenizer",
+    yaxis_title="Median Token Length",
+)
+st.plotly_chart(fig)
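For reference, a minimal standalone sketch of the per-tokenizer metrics aggregation this commit introduces, run outside Streamlit against a small made-up DataFrame. The token counts below are invented and only the two default tokenizer column names from the diff are reused; the real app computes the same statistics over `val_data`.

```python
import numpy as np
import pandas as pd

# Toy stand-in for val_data: one column of token counts per tokenizer.
toy_val_data = pd.DataFrame(
    {
        "openai/gpt4": [12, 15, 9, 22],
        "Xenova/gpt-4o": [11, 14, 10, 20],
    }
)

tokenizer_metrics = {}
for tokenizer in toy_val_data.columns:
    tokens = toy_val_data[tokenizer].dropna()
    tokenizer_metrics[tokenizer] = {
        "Median": float(np.median(tokens)),
        "Min": float(np.min(tokens)),
        "Max": float(np.max(tokens)),
        "Range": float(np.max(tokens) - np.min(tokens)),
        "Standard Deviation": float(np.std(tokens)),
    }

# In the app this dict is rendered with st.json(tokenizer_metrics).
print(tokenizer_metrics)
```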