Commit a5212f9
Parent(s): d222a38
Initial commit of the app.py
app.py ADDED
@@ -0,0 +1,113 @@
from transformers import AutoTokenizer
import gradio as gr
import random

# Default checkpoint plus a few alternatives to populate the dropdown.
checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
checkpoints = [
    checkpoint,
    "microsoft/phi-2",
    "openai/whisper-large-v3",
    "NousResearch/Nous-Hermes-2-Yi-34B",
    "bert-base-cased"
]

placeholder = "Type anything in this text box and hit Tokenize!"
sequences = [
    "The quick brown 🦊 fox jumps over the lazy 🐕 dog!",
    "How vexingly ⏩ quick daft 🦓 zebras jump?",
    "Pack my 📦 box with five dozen 🍷 liquor jugs.",
    "The five 🥊 boxing 🧙‍♂️ wizards jump quickly~",
    "While making deep ⛏️ excavations we found some quaint bronze 💍 jewelry!",
    "Whenever the 🦊 fox jumped, the 🐿️ squirrel gazed suspiciously...",
    "We promptly 🧑‍⚖️ judged antique ivory buckles for the next 🏆 prize."
]

def randomize_sequence():
    return random.choice(sequences)

# Passed as a callable so Gradio draws a fresh sequence on every page load.
sequence = randomize_sequence

def load_tokenizer(checkpoint):
    if "tokenizer" not in globals():
        global tokenizer
        tokenizer = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # Sort the vocabulary by token ID and treat the first entry as the unknown token.
        vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
        unk = next(iter(vocab))
        vocab.pop(unk)
        vocab_sorted = "\n".join(vocab)
        vocab_size = len(vocab)
        gr.Info(f"Tokenizer loaded '{checkpoint}' with vocab size: {vocab_size}")
        #return checkpoint, vocab_size, vocab
        return vocab_size, unk, vocab_sorted
    except Exception as error:
        gr.Warning("An unexpected error occurred while loading the Tokenizer.")
        gr.Warning(f"{error}")
        return None, None, None

def tokenize_er(sequence):
    try:
        # Split the sequence into tokens and pair each token with its ID for the DataFrame.
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        token_id_pair = []
        if len(tokens) == len(ids):
            for i in range(len(ids)):
                token_id_pair.append([tokens[i], ids[i]])
        return token_id_pair
    except NameError:
        gr.Warning("Load Tokenizer before sequencing.")
        return [[None, None]]

def de_tokenize_er(pairs):
    try:
        tokens = []
        ids = []
        for row in pairs:
            tokens.append(row[0])
            try:
                ids.append(int(row[1]))
            except (TypeError, ValueError):
                ids.append(0)
        # Re-encode the (possibly edited) tokens and decode both columns for comparison.
        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
        decoded_tokens = tokenizer.decode(tokens_ids)
        decoded_ids = tokenizer.decode(ids)
        return tokens_ids, decoded_tokens, decoded_ids
    except NameError:
        gr.Warning("Tokenize sequence before decoding.")
        return None, None, None

with gr.Blocks() as frontend:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("# 🔍 Tokenizaminer\n\n### The Tokenizer Examiner... 🕵️🕳️\n\nThe purpose of this tool is to examine the vocabulary and tokens of a model's tokenizer and play with the results.\n\n## Instructions\n\n1. Load a tokenizer\n2. Type and Tokenize a sequence\n3. Manipulate it to see what happens!")
            with gr.Group():
                input_checkpoint = gr.Dropdown(label="1. Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, info="Select from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
                btn_load_tokenizer = gr.Button(value="Load Tokenizer")
            with gr.Row():
                input_sequence = gr.TextArea(label="2. Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True)
            with gr.Row():
                btn_tokenize = gr.Button(value="Tokenize!")
                btn_random_seq = gr.Button(value="Randomize!")
            with gr.Row():
                token_id_pair = gr.DataFrame(label="3. Decode", col_count=(2, "fixed"), headers=["Token", "ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
            with gr.Row():
                btn_decode = gr.Button(value="Decode")
            with gr.Row():
                with gr.Column():
                    output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
                    output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
                with gr.Column():
                    output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
        with gr.Column(scale=1):
            with gr.Group():
                output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
            output_vocab = gr.Code(label="Vocabulary")

    btn_load_tokenizer.click(fn=load_tokenizer, inputs=[input_checkpoint], outputs=[output_vocab_count, output_unknown_token, output_vocab])
    btn_tokenize.click(fn=tokenize_er, inputs=[input_sequence], outputs=[token_id_pair])
    btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
    btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids, output_decoded_tokens, output_decoded_ids])

frontend.launch()
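For reference, the tokenize/decode round trip that the app's Tokenize! and Decode buttons perform can be reproduced outside Gradio. A minimal sketch, assuming the transformers package is installed and the TinyLlama checkpoint above is reachable:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
sequence = "The quick brown fox jumps over the lazy dog!"

# Same steps as tokenize_er: split the sequence into tokens, then map tokens to IDs.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(list(zip(tokens, ids)))

# Same steps as de_tokenize_er: decode the IDs back into text.
print(tokenizer.decode(ids))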