# tamang0000
# wordings
# d655e4d
import logging
# import os
import tiktoken
from transformers import AutoTokenizer
import gradio as gr
# Module-level logger named after this module; nothing in the visible part of
# the file logs through it yet.
logger = logging.getLogger(__name__) # noqa
# hugging face
# hf_token = os.getenv('HUGGINGFACE_TOKEN')
# HfApi().login(token=hf_token)
def load_test_phrases(filename):
    """Read a phrase file from the local ``data`` directory.

    Args:
        filename: Name of a UTF-8 text file located under ``./data``
            (resolved against the current working directory).

    Returns:
        The file's lines as a list of strings, without line terminators.
    """
    phrase_path = f"./data/{filename}"
    with open(phrase_path, mode="r", encoding="utf-8") as phrase_file:
        contents = phrase_file.read()
    return contents.splitlines()
# Tokenizers compared by the app, in display order. Names containing "gpt"
# are loaded through tiktoken by the functions in this file; every other
# entry is loaded with ``AutoTokenizer.from_pretrained``.
models = [
    "Xenova/claude-tokenizer",              # Anthropic Claude
    "meta-llama/Llama-2-7b-chat-hf",        # Llama 2
    "beomi/llama-2-ko-7b",                  # Llama 2 (Korean)
    "ai4bharat/Airavata",                   # Airavata
    "openaccess-ai-collective/tiny-mistral",  # Mistral
    "gpt-3.5-turbo",                        # GPT-3.5
    "meta-llama/Meta-Llama-3-8B-Instruct",  # Llama 3
    "CohereForAI/aya-23-8B",                # Aya
    "google/gemma-1.1-2b-it",               # Gemma
    "gpt-4o",                               # GPT-4o
    "TWO/sutra-mlt256-v2",                  # SUTRA
    "tamang0000/assamese-tokenizer-50k",    # Assamese
]
# Short Assamese example sentences offered as one-click inputs in the
# sentence-inspector tab.
test_phrase_set = [
    "মই আজিৰ পাছত হ’ব লগা হাঁহিৰ বাবে ওলাই থাকিম",
    "আমি চন্দ্ৰলৈ ৰকেট যাত্ৰাত আছোঁ",
    "পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",
    "আমাক পাঁচখন বাক্যৰে নিউট্ৰন বিকিৰণৰ বৰ্ণনা দিয়ক",
    "মোৰ বন্ধুটোৱে চাৰিটা পুথি পঢ়িছে",
    "মোৰ ঘৰখন গাঁওখনৰ আটাইতকৈ বেছি ডাঙৰ",
    "আজিৰে পৰা মই সৰু সৰু কামবোৰ কৰি থাকিম",
    "তেওঁৰ মাতবোৰ আৰু শাৰীবোৰ সলনি হোৱা দেখি চমক লাগিল",
]
# Paragraph-length sample texts, read from ./data once at import time. Each
# file's lines are later joined into a single example for the paragraph tab.
test_phrase_set_long_1 = load_test_phrases(filename="multilingualphrases01-as.txt")
test_phrase_set_long_2 = load_test_phrases(filename="multilingualphrases02-as.txt")
# A third sample set is currently disabled:
# test_phrase_set_long_3 = load_test_phrases('multilingualphrases03.txt')
def generate_tokens_as_table(text):
    """Tokenize ``text`` with every configured tokenizer.

    Args:
        text: The sentence to split into tokens.

    Returns:
        A list with one row per entry of ``models``. Each row starts with the
        model name, followed by the individually decoded tokens, so rows may
        differ in length.
    """
    rows = []
    for name in models:
        if 'gpt' in name:
            # OpenAI models are served by tiktoken rather than the HF hub.
            encoder = tiktoken.encoding_for_model(name)
            token_ids = encoder.encode(text)
        else:
            encoder = AutoTokenizer.from_pretrained(name)
            token_ids = encoder.encode(text, add_special_tokens=False)
        # Decode one id at a time so every token is shown as its own cell.
        pieces = [encoder.decode([token_id]) for token_id in token_ids]
        rows.append([name, *pieces])
    return rows
def generate_tokenizer_table(text):
    """Compute per-tokenizer statistics for ``text``.

    Args:
        text: The paragraph to tokenize.

    Returns:
        One row per entry of ``models``, shaped as
        ``[model, vocab size, word count, token count, tokens per word]``
        where the last item is a string formatted to two decimals. An empty
        list is returned when ``text`` is empty or contains only whitespace.
    """
    # Split on any run of whitespace so newlines, tabs and repeated spaces do
    # not distort the word count; whitespace-only input yields zero words and
    # is treated like empty input (this also rules out a division by zero).
    word_count = len(text.split()) if text else 0
    if word_count == 0:
        return []

    output = []
    for model in models:
        if 'gpt' in model:
            # OpenAI models are served by tiktoken rather than the HF hub.
            tokenizer = tiktoken.encoding_for_model(model)
            vocab_size = tokenizer.n_vocab
            token_count = len(tokenizer.encode(text))
        else:
            tokenizer = AutoTokenizer.from_pretrained(model)
            vocab_size = tokenizer.vocab_size
            # Leave out any special tokens the tokenizer would add by
            # default so counts are comparable across tokenizers and agree
            # with generate_tokens_as_table.
            token_count = len(tokenizer.encode(text, add_special_tokens=False))
        output.append([
            model,
            vocab_size,
            word_count,
            token_count,
            f"{token_count / word_count:0.2f}",
        ])
    return output
def generate_split_token_table(text):
    """Render tokenizer statistics for ``text`` as a Gradio dataframe.

    Args:
        text: The paragraph entered by the user.

    Returns:
        An empty ``gr.Dataframe`` when ``text`` is empty; otherwise a
        five-column ``gr.Dataframe`` holding the rows produced by
        ``generate_tokenizer_table``.
    """
    if not text:
        return gr.Dataframe()
    table = generate_tokenizer_table(text)
    return gr.Dataframe(
        table,
        headers=['tokenizer', 'v size', '#word', '#token', '#tokens/word'],
        # One datatype per column. The ratio column stays "str" because its
        # values are pre-formatted strings.
        datatype=["str", "number", "number", "number", "str"],
        row_count=len(models),
        col_count=(5, "fixed"),
    )
# "Tokenize Paragraphs" tab: compares token counts of all tokenizers on a
# paragraph of text.
with gr.Blocks() as sutra_token_count:
    # The Markdown literal is kept flush-left so it carries no leading
    # whitespace.
    gr.Markdown(
        """
# Assamese Tokenizer Specs & Stats.
## Tokenize paragraphs in multiple languages and compare token counts.
Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
Number of Tokens (The less the better)
""")
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button("Submit")
    output = gr.Dataframe()
    # Each example is a single paragraph built by joining a phrase file's lines.
    examples = [
        [' '.join(test_phrase_set_long_1)],
        [' '.join(test_phrase_set_long_2)],
        # [' '.join(test_phrase_set_long_3)],
    ]
    gr.Examples(examples=examples, inputs=[textbox])
    submit_button.click(generate_split_token_table, inputs=[textbox], outputs=[output])
def generate_tokens_table(text):
    """Render the per-tokenizer token breakdown of ``text`` as a dataframe.

    Args:
        text: The sentence entered by the user.

    Returns:
        An empty ``gr.Dataframe`` when there is nothing to show; otherwise a
        ``gr.Dataframe`` with one row per tokenizer, a leading ``model``
        column and one numbered column per token position.
    """
    if not text:
        return gr.Dataframe()
    table = generate_tokens_as_table(text)
    if not table:
        return gr.Dataframe()
    # Tokenizers split the text into different numbers of tokens, so size the
    # table by the longest row and pad the shorter ones with empty cells.
    cols = max(len(row) for row in table)
    padded = [row + [''] * (cols - len(row)) for row in table]
    return gr.Dataframe(
        padded,
        headers=['model'] + [str(i) for i in range(cols - 1)],
        row_count=len(padded),
        col_count=(cols, "fixed"),
    )
# "Tokenize Text" tab: shows how each tokenizer breaks a single sentence
# into tokens.
with gr.Blocks() as sutra_tokenize:
    # The Markdown literal is kept flush-left so it carries no leading
    # whitespace.
    gr.Markdown(
        value="""
# Assamese Tokenizer Sentence Inspector.
## Tokenize a sentence with various tokenizers and inspect how it's broken down.
Space inspired from [SUTRA](https://huggingface.co/spaces/TWO/sutra-tokenizer-comparison)
Number of Tokens (The less the better)
""",
    )
    textbox = gr.Textbox(label="Input Text")
    submit_button = gr.Button(value="Submit")
    output = gr.Dataframe()
    # The short sample sentences double as clickable examples.
    examples = test_phrase_set
    gr.Examples(examples, inputs=[textbox])
    submit_button.click(
        fn=generate_tokens_table,
        inputs=[textbox],
        outputs=[output],
    )
# Script entry point: wrap both tabs in one app and start the server.
if __name__ == '__main__':
    with gr.Blocks(analytics_enabled=False) as demo:
        with gr.Row():
            # Header image; the Markdown literal is kept flush-left so it
            # carries no leading whitespace.
            gr.Markdown(
                """
## <img src="https://sagartamang.com/img/favicon.png" height="100%"/>
"""
            )
        with gr.Row():
            gr.TabbedInterface(
                interface_list=[sutra_tokenize, sutra_token_count],
                tab_names=["Tokenize Text", "Tokenize Paragraphs"]
            )

    # Listen on all interfaces. ``allowed_paths=["/"]`` was removed on
    # purpose: it allowed Gradio to serve any file on the host to any client,
    # and this app serves no local files (the header image is a remote URL).
    demo.queue(default_concurrency_limit=5).launch(
        server_name="0.0.0.0",
    )