|
import logging |
|
|
|
import tiktoken |
|
from transformers import AutoTokenizer |
|
|
|
import gradio as gr |
|
|
|
# Module-level logger, named after the importing module.
logger = logging.getLogger(name=__name__)
|
|
|
|
|
|
|
|
|
|
|
def load_test_phrases(filename):
    """Return the lines of ``./data/<filename>`` as a list of strings.

    The file is read as UTF-8 in one go and split with ``str.splitlines``,
    so the returned lines carry no line terminators.
    """
    phrase_path = f"./data/{filename}"
    with open(phrase_path, mode="r", encoding="utf-8") as handle:
        contents = handle.read()
    return contents.splitlines()
|
|
|
|
|
# Tokenizers to compare, in display order. Entries whose name contains
# "gpt" are loaded through tiktoken; every other entry is loaded with
# transformers' AutoTokenizer.
models = [
    "Xenova/claude-tokenizer",
    "meta-llama/Llama-2-7b-chat-hf",
    "beomi/llama-2-ko-7b",
    "ai4bharat/Airavata",
    "openaccess-ai-collective/tiny-mistral",
    "gpt-3.5-turbo",
    "meta-llama/Meta-Llama-3-8B-Instruct",
    "CohereForAI/aya-23-8B",
    "google/gemma-1.1-2b-it",
    "gpt-4o",
    "TWO/sutra-mlt256-v2",
    "tamang0000/assamese-tokenizer-50k",
]
|
|
|
# Short Assamese sample sentences, offered as clickable examples in the
# sentence-inspector tab.
test_phrase_set = [
    "মই আজিৰ পাছত হ’ব লগা হাঁহিৰ বাবে ওলাই থাকিম",
    "আমি চন্দ্ৰলৈ ৰকেট যাত্ৰাত আছোঁ",
    # The next two sentences differ only by a leading word.
    "পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",
    "আমাক পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",
    # Everyday sentences.
    "মোৰ বন্ধুটোৱে চাৰিটা পুথি পঢ়িছে",
    "মোৰ ঘৰখন গাঁওখনৰ আটাইতকৈ বেছি ডাঙৰ",
    "আজিৰে পৰা মই সৰু সৰু কামবোৰ কৰি থাকিম",
    "তেওঁৰ মাতবোৰ আৰু শাৰীবোৰ সলনি হোৱা দেখি চমক লাগিল",
]
|
|
|
# Longer sample texts, read from ./data at import time (file 01 first,
# then file 02); each name holds that file's list of lines.
test_phrase_set_long_1, test_phrase_set_long_2 = map(
    load_test_phrases,
    ("multilingualphrases01-as.txt", "multilingualphrases02-as.txt"),
)
|
|
|
|
|
|
|
def generate_tokens_as_table(text):
    """Tokenize ``text`` with every entry of ``models``.

    Returns a list with one row per model. Each row starts with the model
    name, followed by every token id decoded on its own back into a string.
    Rows can therefore differ in length from model to model.
    """
    rows = []
    for model_name in models:
        if 'gpt' in model_name:
            # OpenAI models: tiktoken encoding looked up by model name.
            encoder = tiktoken.encoding_for_model(model_name)
            token_ids = encoder.encode(text)
        else:
            # Everything else is loaded via AutoTokenizer; special tokens
            # are left out of the encoding.
            encoder = AutoTokenizer.from_pretrained(model_name)
            token_ids = encoder.encode(text, add_special_tokens=False)

        # Decode each id individually to show the text of every token.
        pieces = [encoder.decode([token_id]) for token_id in token_ids]
        rows.append([model_name, *pieces])
    return rows
|
|
|
|
|
def generate_tokenizer_table(text):
    """Build one statistics row per tokenizer for ``text``.

    Each row is ``[model, vocab size, word count, token count, ratio]``
    where ``ratio`` is tokens-per-word formatted as a string with two
    decimals. The vocabulary size is read from ``vocab_size`` for
    AutoTokenizer-loaded models and from ``n_vocab`` for tiktoken ones.

    Returns an empty list when ``text`` is empty, ``None`` or contains only
    whitespace (there are no words to relate the token count to).
    """
    # Split on any run of whitespace: splitting on a single ' ' counted
    # empty strings as words for repeated spaces and ignored newlines/tabs.
    word_count = len(text.split()) if text else 0
    if word_count == 0:
        return []

    output = []
    for model in models:
        if 'gpt' not in model:
            tokenizer = AutoTokenizer.from_pretrained(model)
            vocab = tokenizer.vocab_size
        else:
            tokenizer = tiktoken.encoding_for_model(model)
            vocab = tokenizer.n_vocab

        # NOTE(review): encode() is called with its defaults here, whereas
        # generate_tokens_as_table passes add_special_tokens=False for
        # AutoTokenizer models. Counts may therefore include special tokens
        # for some models -- confirm whether that is intended.
        token_count = len(tokenizer.encode(text))

        output.append(
            [model, vocab, word_count, token_count, f"{token_count / word_count:0.2f}"]
        )

    return output
|
|
|
|
|
def generate_split_token_table(text):
    """Return a ``gr.Dataframe`` with per-tokenizer statistics for ``text``.

    An empty ``gr.Dataframe()`` is returned when ``text`` is empty. Otherwise
    the rows come from ``generate_tokenizer_table`` and are shown in five
    fixed columns.
    """
    if not text:
        return gr.Dataframe()

    table = generate_tokenizer_table(text)
    return gr.Dataframe(
        table,
        headers=['tokenizer', 'v size', '#word', '#token', '#tokens/word'],
        # One datatype per column: name, three integer counts, and the
        # ratio, which generate_tokenizer_table delivers pre-formatted as a
        # string.
        datatype=["str", "number", "number", "number", "str"],
        row_count=len(models),
        col_count=(5, "fixed"),
    )
|
|
|
|
|
# "Tokenize Paragraphs" tab: paste a longer text and compare vocabulary
# size, token count and tokens-per-word across all tokenizers.
with gr.Blocks() as sutra_token_count:
    gr.Markdown(
        """
        # Assamese Tokenizer Specs & Stats.
        ## Tokenize paragraphs in multiple languages and compare token counts.
        Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
        Number of Tokens (The less the better)
        """)
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    # Each example is the full contents of one sample file, with its lines
    # joined by single spaces into one paragraph.
    examples = [
        [' '.join(test_phrase_set_long_1)],
        [' '.join(test_phrase_set_long_2)],
    ]
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])
|
|
|
|
|
def generate_tokens_table(text):
    """Return a ``gr.Dataframe`` showing how each tokenizer splits ``text``.

    Rows come from ``generate_tokens_as_table``: the model name followed by
    its decoded tokens. Because tokenizers produce different numbers of
    tokens, shorter rows are padded with empty strings so the table is
    rectangular and as wide as the longest row.
    """
    table = generate_tokens_as_table(text)

    # Size the table by the widest row, not by whichever model is first.
    # The default keeps the single 'model' column if the table is empty.
    cols = max((len(row) for row in table), default=1)
    padded = [list(row) + [""] * (cols - len(row)) for row in table]

    return gr.Dataframe(
        padded,
        headers=['model'] + [str(i) for i in range(cols - 1)],
        # One row per tokenizer (the previous hard-coded 2 did not match).
        row_count=len(padded) or 1,
        col_count=(cols, "fixed"),
    )
|
|
|
|
|
# "Tokenize Text" tab: enter one sentence and inspect, token by token, how
# every tokenizer breaks it down.
with gr.Blocks() as sutra_tokenize:
    # Heading and short description shown at the top of the tab.
    gr.Markdown(
        """
        # Assamese Tokenizer Sentence Inspector.
        ## Tokenize a sentence with various tokenizers and inspect how it's broken down.
        Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
        Number of Tokens (The less the better)
        """)

    # Input, trigger and result components, created in display order.
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button(value="Submit")
    output = gr.Dataframe()

    # The short sample sentences are offered as clickable examples.
    examples = test_phrase_set
    gr.Examples(examples=examples, inputs=[textbox])

    # Clicking the button fills the dataframe with the per-token breakdown.
    submit_button.click(
        fn=generate_tokens_table,
        inputs=[textbox],
        outputs=[output],
    )
|
|
|
|
|
if __name__ == '__main__':
    # Top-level app: a logo row followed by the two tabs defined above.
    with gr.Blocks(analytics_enabled=False) as demo:
        with gr.Row():
            gr.Markdown(
                """
                ## <img src="https://sagartamang.com/img/favicon.png" height="100%"/>
                """
            )
        with gr.Row():
            gr.TabbedInterface(
                interface_list=[sutra_tokenize, sutra_token_count],
                tab_names=["Tokenize Text", "Tokenize Paragraphs"]
            )

    # Listen on all interfaces. ``allowed_paths`` is deliberately not set:
    # the UI only references a remote image, and allowing "/" would let
    # Gradio serve any file readable by this process to anyone who can
    # reach the server.
    demo.queue(default_concurrency_limit=5).launch(
        server_name="0.0.0.0",
    )
|
|