Spaces:
Runtime error
Runtime error
Init
Browse files
app.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
from collections import defaultdict
|
3 |
+
import tqdm
|
4 |
+
import transformers
|
5 |
+
from transformers import AutoTokenizer
|
6 |
+
import pandas as pd
|
7 |
+
import matplotlib.pyplot as plt
|
8 |
+
import seaborn as sns
|
9 |
+
import numpy as np
|
10 |
+
import plotly.figure_factory as ff
|
11 |
+
import plotly.express as px
|
12 |
+
from plotly.subplots import make_subplots
|
13 |
+
import plotly.graph_objects as go
|
14 |
+
import random, glob
|
15 |
+
|
16 |
+
|
17 |
+
@st.cache_data
def load_data():
    """Read the validation dataset CSV and return it as a DataFrame.

    The file name is resolved relative to the current working directory.
    The function is wrapped in ``st.cache_data``.
    """
    dataset = pd.read_csv("MassiveDatasetValidationData.csv")
    return dataset
|
20 |
+
|
21 |
+
|
22 |
+
def reload_example_text_data(language):
    """Pick one random example sentence for ``language`` and publish it.

    The result is stored in ``st.session_state.examplesdf`` as a table
    indexed by the ``iso`` code, with a ``Text`` column followed by one
    ``Num Tokens (<tokenizer>)`` column per selected tokenizer. When no row
    matches ``language`` the stored table is empty but keeps those columns.

    Reads the module-level ``val_data`` DataFrame (columns ``id``, ``lang``,
    ``iso``, ``text`` plus one column per selected tokenizer) and the
    module-level ``selected_tokenizers`` list.

    Args:
        language: Language to sample from. Matched against the ``lang``
            column (which is what the language selector is populated from)
            and, for backward compatibility, against the ``iso`` column.
    """
    # The selector offers values of ``lang``; filtering on ``iso`` alone
    # yields an empty table whenever the two columns hold different values.
    matches_language = (val_data["lang"] == language) | (val_data["iso"] == language)
    language_rows = val_data[matches_language]

    if language_rows.empty:
        example = language_rows
    else:
        # Sample from a plain list: random.choice() on a Series looks the
        # drawn position up as an index *label*, which raises when the index
        # is not 0..n-1.
        random_id = random.choice(language_rows["id"].tolist())
        example = language_rows[language_rows["id"] == random_id]

    # set_index() without inplace returns a new frame, so nothing is mutated
    # on a filtered slice of val_data.
    example = example[["iso", "text", *selected_tokenizers]].set_index("iso")
    example.columns = ["Text"] + [f"Num Tokens ({name})" for name in selected_tokenizers]
    st.session_state.examplesdf = example
|
30 |
+
|
31 |
+
|
32 |
+
# Tokenizer identifiers offered in the sidebar multiselect, in display order.
# The same strings are used as column names when reading per-tokenizer
# values from the dataset and are passed to AutoTokenizer.from_pretrained().
tokenizer_names_to_test = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
]
|
50 |
+
|
51 |
+
# Sidebar: project blurb, the three controls that drive the figure
# (tokenizers, language, figure type) and a random example sentence.
#
# NOTE(review): indentation was lost in the copy this block was restored
# from, so the extent of the sidebar is inferred. Everything through the
# "Example Text" section is assumed to live inside it -- confirm.
with st.sidebar:
    st.header("Comparing Tokenizers")
    link = "This project compares the tokenization length for different tokenizers. Some tokenizers may result in significantly more tokens than others for the same text."
    st.markdown(link)

    st.header("Data Visualization")

    # Which tokenizers to compare (the widget caps the selection at six).
    st.subheader("Tokenizers")
    selected_tokenizers = st.multiselect(
        label="Select tokenizers",
        options=tokenizer_names_to_test,
        default=["openai/gpt4", "Xenova/gpt-4o", "Xenova/claude-tokenizer"],
        max_selections=6,
        label_visibility="collapsed",
    )

    # Load the dataset and report how many rows it has.
    st.subheader("Data")
    with st.spinner("Loading dataset..."):
        val_data = load_data()
    st.success(f"Data loaded: {len(val_data)}")

    with st.expander("Data Source"):
        st.write(
            "The data in this figure is the validation set of the [Amazon Massive](https://huggingface.co/datasets/AmazonScience/massive/viewer/af-ZA/validation) dataset, which consists of 2033 short sentences and phrases translated into 51 different languages. Learn more about the dataset from [Amazon's blog post](https://www.amazon.science/blog/amazon-releases-51-language-dataset-for-language-understanding)"
        )

    # Language picker, pre-selecting "English" when the dataset has it and
    # the first entry otherwise.
    st.subheader("Language")
    language_options = sorted(val_data["lang"].unique())
    try:
        default_language_index = language_options.index("English")
    except ValueError:
        default_language_index = 0
    selected_language = st.selectbox(
        label="Select language",
        options=language_options,
        index=default_language_index,
        label_visibility="collapsed",
    )

    # Figure type; "Boxplot" is the initial choice.
    st.subheader("Figure")
    selected_figure = st.radio(
        label="Select figure type",
        options=["Boxplot", "Histogram", "Scatterplot"],
        index=0,
        label_visibility="collapsed",
    )

    # A freshly sampled example is drawn on every script run; the button
    # additionally draws one through its click callback.
    st.header("Example Text")
    with st.spinner("Loading example text..."):
        reload_example_text_data(selected_language)
    st.table(st.session_state.examplesdf)
    st.button(
        "Reload",
        on_click=reload_example_text_data,
        args=(selected_language,),
    )
|
101 |
+
|
102 |
+
# Token counts per tokenizer: tokenizer name -> one count per row of
# val_data, in row order.
#
# Each tokenizer is loaded exactly once, up front. Loading it inside the row
# loop repeated the from_pretrained() call for every row of the dataset.
#
# NOTE(review): reload_example_text_data reads per-tokenizer columns from
# val_data and labels them "Num Tokens"; if those hold the same quantity,
# this whole pass could reuse them instead of re-tokenizing -- confirm.
loaded_tokenizers = {
    tokenizer_name: AutoTokenizer.from_pretrained(tokenizer_name)
    for tokenizer_name in selected_tokenizers
}

tokenizer_to_num_tokens = defaultdict(list)
# Only the "text" column is needed, so iterate it directly rather than
# materialising every row with iterrows().
for text in tqdm.tqdm(val_data["text"], total=len(val_data)):
    for tokenizer_name, tokenizer in loaded_tokenizers.items():
        num_tokens = len(tokenizer(text)["input_ids"])
        tokenizer_to_num_tokens[tokenizer_name].append(num_tokens)
|
109 |
+
|
110 |
+
# Render the figure type chosen in the sidebar from the per-tokenizer token
# counts collected in tokenizer_to_num_tokens.
if not selected_tokenizers:
    # Nothing to plot. Without this guard the histogram branch would call
    # make_subplots(rows=0), which plotly rejects with a ValueError.
    st.warning("Select at least one tokenizer to display a figure.")
elif selected_figure == "Boxplot":
    # One box per tokenizer, all on a shared axis.
    fig = go.Figure()
    for tokenizer_name in selected_tokenizers:
        fig.add_trace(
            go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
        )
    fig.update_layout(
        title="Distribution of Number of Tokens for Selected Tokenizers",
        xaxis_title="Tokenizer",
        yaxis_title="Number of Tokens",
    )
    st.plotly_chart(fig)
elif selected_figure == "Histogram":
    # One histogram per tokenizer, stacked vertically (200 px per row).
    fig = make_subplots(
        rows=len(selected_tokenizers), cols=1, subplot_titles=selected_tokenizers
    )
    for i, tokenizer_name in enumerate(selected_tokenizers):
        fig.add_trace(
            go.Histogram(
                x=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name
            ),
            row=i + 1,  # subplot rows are 1-based
            col=1,
        )
    fig.update_layout(
        height=200 * len(selected_tokenizers),
        title_text="Histogram of Number of Tokens",
    )
    st.plotly_chart(fig)
elif selected_figure == "Scatterplot":
    # Pairwise scatter matrix: one column of counts per tokenizer.
    df = pd.DataFrame(tokenizer_to_num_tokens)
    fig = px.scatter_matrix(
        df,
        dimensions=selected_tokenizers,
        color_discrete_sequence=px.colors.qualitative.Plotly,
    )
    fig.update_layout(
        title="Scatterplot Matrix of Number of Tokens for Selected Tokenizers",
        width=800,
        height=800,
    )
    st.plotly_chart(fig)
|