# Streamlit app: compares how different tokenizers segment Burmese text.
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
@st.cache_data
def load_data(path: str = "dataset.csv") -> pd.DataFrame:
    """Load the per-language token-count dataset.

    Parameters
    ----------
    path:
        CSV file to read. Defaults to ``"dataset.csv"`` so existing
        callers (``load_data()``) are unaffected.

    Returns
    -------
    pd.DataFrame
        The raw dataset; expected to contain a ``lang`` column, a
        ``text`` column, and one token-count column per tokenizer
        (inferred from how the rest of this script indexes it).

    Notes
    -----
    Wrapped in ``st.cache_data`` so Streamlit reads the CSV once per
    distinct argument instead of on every rerun.
    """
    return pd.read_csv(path)
def reload_example_text_data(selected_language, selected_tokenizers):
    """Sample one random row for ``selected_language`` and publish its
    per-tokenizer token counts.

    Side effects: stores a one-row DataFrame (only the columns named in
    ``selected_tokenizers``) in ``st.session_state.examplesdf``.

    Parameters
    ----------
    selected_language:
        Value matched against the ``lang`` column of the global ``val_data``.
    selected_tokenizers:
        Column names (tokenizer ids) to keep in the published sample.

    Returns
    -------
    The ``text`` value of the sampled row.

    Raises
    ------
    ValueError
        If the dataset contains no rows for ``selected_language``
        (previously this surfaced as an obscure pandas sampling error).
    """
    lang_rows = val_data[val_data["lang"] == selected_language]
    if lang_rows.empty:
        raise ValueError(f"No rows found for language {selected_language!r}")
    sample = lang_rows.sample(n=1)
    selected_text = sample["text"].iloc[0]
    # Keep only the token-count columns for the chosen tokenizers.
    # (The original also re-assigned the columns to the same names,
    # which was a no-op and has been dropped.)
    st.session_state.examplesdf = sample[list(selected_tokenizers)]
    return selected_text
# Load the dataset once at module level; @st.cache_data makes reruns cheap.
val_data = load_data()

# Tokenizers compared in the app. Each entry is expected to be a column
# of val_data holding token counts. Most are Hugging Face model ids;
# "openai/gpt4" (tiktoken) and the simbolo-ai tokenizer are hosted on
# GitHub instead — the link-building code below special-cases them.
tokenizer_names_to_test = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
    "simbolo-ai/multilingual-partial-syllable-tokenizer",
]
# Sidebar: project blurb plus the tokenizer picker.
with st.sidebar:
    st.title("Tokenizer Comparisons")
    st.write(
        """
    Explore the performance of various tokenizers on the Burmese language. This tool visualizes how different tokenizers process the same Burmese text, highlighting disparities in tokenization.
    This project is inspired by the insights from "All languages are NOT created (tokenized) equal!" Read more about it in the original article on [Art Fish Intelligence](https://www.artfish.ai/p/all-languages-are-not-created-tokenized).
    """
    )

    # The "Select All" checkbox short-circuits the manual picker: when it
    # is ticked the multiselect widget is not rendered at all.
    all_tokenizers = st.checkbox("Select All Tokenizers")
    default_selection = [
        "openai/gpt4",
        "Xenova/gpt-4o",
        "CohereForAI/aya-101",
        "Xenova/claude-tokenizer",
    ]
    if all_tokenizers:
        selected_tokenizers = tokenizer_names_to_test
    else:
        selected_tokenizers = st.multiselect(
            "Select tokenizers",
            options=tokenizer_names_to_test,
            default=default_selection,
            label_visibility="collapsed",
        )
# Tokenizers whose canonical home is a GitHub repo, not the HF Hub.
# Maps tokenizer id -> "<github-org>/<github-repo>".
_GITHUB_REPOS = {
    "openai/gpt4": "openai/tiktoken",
    "simbolo-ai/multilingual-partial-syllable-tokenizer": "simbolo-ai/multilingual-partial-syllable-tokenizer",
}


def _tokenizer_link(name):
    """Return a markdown link for *name*.

    GitHub for the two repo-hosted tokenizers, the Hugging Face model
    page for everything else. Produces exactly the same strings as the
    previous nested-ternary expression, just readably.
    """
    repo = _GITHUB_REPOS.get(name)
    if repo is not None:
        return f"[{name}](https://github.com/{repo})"
    return f"[{name}](https://huggingface.co/{name})"


links = [_tokenizer_link(name) for name in selected_tokenizers]
link = "Tokenized using " + ", ".join(links)
st.markdown(link, unsafe_allow_html=True)
# Show one random Burmese example and its per-tokenizer token counts.
selected_text = reload_example_text_data("Burmese", selected_tokenizers)
st.subheader(f"**Sampled Text:** `{selected_text}`")
st.subheader("Number of Tokens")
# reload_example_text_data stored the one-row counts frame in session state.
st.table(st.session_state.examplesdf)
# ---- Token-count distribution plots for the selected tokenizers ----
# Imported here (not at file top) because it is only needed for the
# distplot below; kept local to preserve the original import behavior.
import plotly.figure_factory as ff

if selected_tokenizers:
    # One series of token counts per tokenizer. NaNs are dropped once,
    # up front, so both charts describe exactly the same data — the
    # previous version dropped NaNs for the density plot but fed raw
    # .tolist() values (NaNs included) to the box plot.
    hist_data = [val_data[name].dropna() for name in selected_tokenizers]

    # Kernel-density curves only: no histogram bars, no rug marks.
    fig = ff.create_distplot(
        hist_data, selected_tokenizers, show_hist=False, show_rug=False
    )
    fig.update_layout(
        title="Token Distribution Density",
        xaxis_title="Number of Tokens",
        yaxis_title="Density",
        height=500,
    )
    st.plotly_chart(fig, use_container_width=True)

    # Box plots of the same series: median, quartiles, outliers.
    fig = go.Figure()
    for name, counts in zip(selected_tokenizers, hist_data):
        fig.add_trace(go.Box(y=counts, name=name))
    fig.update_layout(title="Token Count Variability")
    # use_container_width added for consistency with the chart above.
    st.plotly_chart(fig, use_container_width=True)
else:
    st.error(
        "No tokenizers selected. Please select at least one tokenizer to view the distribution plot."
    )