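"""Streamlit app comparing how different tokenizers segment Burmese text.

Assumes `dataset.csv` (schema inferred from the code below) has a `lang`
column, a `text` column, and one numeric column per tokenizer holding the
token count that tokenizer produced for each text.
"""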
import pandas as pd
import plotly.figure_factory as ff
import plotly.graph_objects as go
import streamlit as st


@st.cache_data
def load_data():
    # Cache the CSV read so Streamlit reruns don't reload it from disk.
    return pd.read_csv("dataset.csv")


def reload_example_text_data(selected_language, selected_tokenizers):
    # Sample one random row for the given language and keep only the
    # token-count columns for the selected tokenizers.
    tempdf = val_data[val_data["lang"] == selected_language]
    random_sample = tempdf.sample(n=1)
    selected_text = random_sample["text"].iloc[0]
    st.session_state.examplesdf = random_sample[selected_tokenizers]
    return selected_text


val_data = load_data()

tokenizer_names_to_test = [
    "openai/gpt4",
    "Xenova/gpt-4o",
    "Xenova/claude-tokenizer",
    "CohereForAI/aya-101",
    "meta-llama/Meta-Llama-3-70B",
    "mistralai/Mixtral-8x22B-v0.1",
    "google/gemma-7b",
    "facebook/nllb-200-distilled-600M",
    "xlm-roberta-base",
    "bert-base-uncased",
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    "bigscience/bloom",
    "StabilityAI/stablelm-base-alpha-7b",
    "google/flan-t5-base",
    "facebook/mbart-large-50",
    "EleutherAI/gpt-neox-20b",
    "simbolo-ai/multilingual-partial-syllable-tokenizer",
]
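
# Each name above also serves as a column in dataset.csv holding that
# tokenizer's token counts (see the plotting code below).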

with st.sidebar:
    st.title("Tokenizer Comparisons")
    st.write(
        """
    Explore how different tokenizers handle Burmese. This tool visualizes how each selected tokenizer processes the same Burmese text, highlighting disparities in tokenization.

    This project is inspired by the insights from "All languages are NOT created (tokenized) equal!" Read more in the original article on [Art Fish Intelligence](https://www.artfish.ai/p/all-languages-are-not-created-tokenized).
    """
    )

    all_tokenizers = st.checkbox("Select All Tokenizers")
    if all_tokenizers:
        selected_tokenizers = tokenizer_names_to_test
    else:
        selected_tokenizers = st.multiselect(
            "Select tokenizers",
            options=tokenizer_names_to_test,
            default=[
                "openai/gpt4",
                "Xenova/gpt-4o",
                "CohereForAI/aya-101",
                "Xenova/claude-tokenizer",
            ],
            label_visibility="collapsed",
        )
    def tokenizer_link(name):
        # "openai/gpt4" and the Simbolo tokenizer are not hosted on the
        # Hugging Face Hub, so link them to GitHub (tiktoken for GPT-4).
        if name == "openai/gpt4":
            return f"[{name}](https://github.com/openai/tiktoken)"
        if name == "simbolo-ai/multilingual-partial-syllable-tokenizer":
            return f"[{name}](https://github.com/{name})"
        return f"[{name}](https://huggingface.co/{name})"

    links = [tokenizer_link(name) for name in selected_tokenizers]
    st.markdown("Tokenized using " + ", ".join(links), unsafe_allow_html=True)

selected_text = reload_example_text_data("Burmese", selected_tokenizers)
st.subheader(f"**Sampled Text:** `{selected_text}`")
st.subheader("Number of Tokens")
st.table(st.session_state.examplesdf)

# Create a distribution plot of token-count density across the selected tokenizers

if selected_tokenizers:
    # Collecting data for all selected tokenizers
    hist_data = [val_data[tokenizer].dropna() for tokenizer in selected_tokenizers]

    # KDE-only distplot (histogram and rug display disabled)
    fig = ff.create_distplot(
        hist_data, selected_tokenizers, show_hist=False, show_rug=False
    )
    fig.update_layout(
        title="Token Distribution Density",
        xaxis_title="Number of Tokens",
        yaxis_title="Density",
        height=500,
    )
    st.plotly_chart(fig, use_container_width=True)

    tokenizer_to_num_tokens = {
        name: val_data[name].tolist() for name in selected_tokenizers
    }

    fig = go.Figure()
    for tokenizer_name in selected_tokenizers:
        fig.add_trace(
            go.Box(y=tokenizer_to_num_tokens[tokenizer_name], name=tokenizer_name)
        )
    fig.update_layout(title="Token Count Variability")
    st.plotly_chart(fig, use_container_width=True)
else:
    st.error(
        "No tokenizers selected. Please select at least one tokenizer to view the plots."
    )
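
# To run locally (assuming this file is saved as app.py; the filename is not
# given in the source): streamlit run app.py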