from transformers import AutoTokenizer
import gradio as gr
import random
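
# Tokenizaminer: a small Gradio app for poking at a tokenizer's vocabulary.
# Pick (or type) a Hugging Face checkpoint, tokenize a sequence, inspect and edit
# the resulting token / vocabulary-ID pairs, then decode them back into text.
# Only the tokenizer files are downloaded, never the model weights.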

checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
checkpoints = [
    checkpoint,
    "microsoft/phi-2",
    "openai/whisper-large-v3",
    "NousResearch/Nous-Hermes-2-Yi-34B",
    "bert-base-cased"
]
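
# Placeholder for the sequence text box and a handful of pangram-style example
# sentences (emoji included) that the "Randomize!" button draws from.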

placeholder = "Type anything in this text box and hit Tokenize!"
sequences = [
    "The quick brown 🦊 fox jumps over the lazy 🐕 dog!",
    "How vexingly ⏩ quick daft 🦓 zebras jump?",
    "Pack my 📦 box with five dozen 🍷 liquor jugs.",
    "The five 🥊 boxing 🧙‍♂️ wizards jump quickly~",
    "While making deep ⛏️ excavations we found some quaint bronze 💍 jewelry!",
    "Whenever the 🦊 fox jumped, the 🐿️ squirrel gazed suspiciously...",
    "We promptly 🧑‍⚖️ judged antique ivory buckles for the next 🏆 prize."
]

def randomize_sequence():
    return random.choice(sequences)

# Note: assigned without calling it. Gradio treats a callable component value as a
# factory and re-evaluates it on every page load, so each visit gets a fresh sequence.
sequence = randomize_sequence

def load_tokenizer(checkpoint):
    """Load (or reuse) the tokenizer for `checkpoint` and return its vocabulary stats."""
    global tokenizer
    if "tokenizer" not in globals():
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    try:
        if checkpoint == tokenizer.name_or_path:
            gr.Info(f"Tokenizer already loaded '{checkpoint}'")
        else:
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # Sort the vocabulary by token ID, set aside the lowest-ID entry (shown as the
        # "Unknown Token"), and render the rest one token per line for the Code viewer.
        vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
        unk = next(iter(vocab))
        vocab.pop(unk)
        vocab_sorted = "\n".join(vocab)
        vocab_size = len(vocab)
        gr.Info(f"Tokenizer vocab size: {vocab_size}")
        return vocab_size, unk, vocab_sorted
    except Exception as error:
        gr.Warning("An unexpected error occurred while loading the Tokenizer.")
        gr.Warning(f"{error}")
        return None, None, None

def tokenize_er(checkpoint, sequence):
    """Tokenize `sequence` with the selected checkpoint and pair each token with its ID."""
    vocab_size, unk, vocab_sorted = load_tokenizer(checkpoint)
    try:
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # Build [token, id] rows for the editable DataFrame.
        token_id_pair = [[token, token_id] for token, token_id in zip(tokens, ids)]
        return token_id_pair, vocab_size, unk, vocab_sorted
    except NameError:
        gr.Warning("Select Tokenizer before sequencing.")
        return [[None, None]], None, None, None
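
# Rough sketch of the expected shape (token strings and IDs depend on the tokenizer):
# with a SentencePiece-style checkpoint such as the default TinyLlama one,
# tokenize_er(checkpoint, "Hello world") returns rows like [["▁Hello", <id>], ["▁world", <id>]]
# together with the vocab size, the lowest-ID token, and the vocabulary listing.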

def de_tokenize_er(pairs):
    """Decode the (possibly edited) token/ID table back into text, from both columns."""
    try:
        tokens = []
        ids = []
        for row in pairs:
            tokens.append(row[0])
            try:
                ids.append(int(row[1]))
            except (TypeError, ValueError):
                ids.append(0)
        # Re-encode the token column to IDs, then decode both columns independently
        # so edits to either side can be compared.
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        decoded_tokens = tokenizer.decode(token_ids)
        decoded_ids = tokenizer.decode(ids)
        return token_ids, decoded_tokens, decoded_ids
    except NameError:
        gr.Warning("Tokenize sequence before decoding.")
        return None, None, None
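
# Layout: the wide left column holds the tokenizer dropdown, the sequence box, the
# editable token/ID table, and the decoded outputs; the narrow right column shows the
# vocab size, the lowest-ID ("unknown") token, and the full vocabulary listing.
# The button callbacks are wired up at the bottom of the Blocks context.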

with gr.Blocks() as frontend:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("# πŸ‡ Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... πŸ•΅οΈπŸ•³οΈ\nThe purpose of this tool is to examine the vocabulary and tokens of a models tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➑️\n\n⚠️ Loading the vocabulary can take a few seconds.")
            with gr.Row():
                gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from πŸ€— Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
            with gr.Group():
                input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
            with gr.Row():
                gr.Markdown("\n#### 2. Sequence & Tokenize")
            with gr.Row():
                input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
            with gr.Row():
                btn_tokenize = gr.Button(value="Tokenize!")
                btn_random_seq = gr.Button(value="Randomize!")
            with gr.Row():
                gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
            with gr.Row():
                token_id_pair = gr.DataFrame(col_count=(2,"fixed"), headers=["Token","Vocabulary ID"], value=[[None,0]], type="array", datatype=["str", "number"], height=400, interactive=True)
            with gr.Row():
                btn_decode = gr.Button(value="Decode")
                btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
            with gr.Row():
                with gr.Column():
                    output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
                    output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
                with gr.Column():
                    output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("\n#### 🎲 Tokenizer Data")
                output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
                output_vocab = gr.Code(label="Vocabulary IDs")

        # Wire up the buttons: tokenize the sequence, pick a random example, decode the edited table.
        btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair, output_vocab_count, output_unknown_token, output_vocab], queue=True)
        btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
        btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids, output_decoded_tokens, output_decoded_ids])

frontend.launch()