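# Tokenizaminer - the Tokenizer Examiner: a Gradio Space for inspecting the
# vocabulary and tokens of any 🤗 Hub checkpoint's tokenizer. Only the
# tokenizer data is downloaded, never the model weights.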
from transformers import AutoTokenizer
import gradio as gr
import random
checkpoint = "dslim/bert-base-NER"
checkpoints = [
checkpoint,
"TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"microsoft/phi-2",
"openai/whisper-large-v3",
"NousResearch/Nous-Hermes-2-Yi-34B",
"bert-base-cased"
]
placeholder = "Type anything in this text box and hit Tokenize!"
sequences = [
"The quick brown π¦ fox jumps over the lazy π dog!",
"How vexingly β© quick daft π¦ zebras jump?",
"Pack my π¦ box with five dozen π· liquor jugs.",
"The five π₯ boxing π§ββοΈ wizards jump quickly~",
"While making deep βοΈ excavations we found some quaint bronze π jewelry!",
"Whenever the π¦ fox jumped, the πΏοΈ squirrel gazed suspiciously...",
"We promptly π§ββοΈ judged antique ivory buckles for the next π prize."
]

def randomize_sequence():
    return random.choice(sequences)

sequence = randomize_sequence()
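
# Load the tokenizer for `target_model` (unless it is already the current one)
# and return its name, vocab size, token 0, and the vocabulary sorted by ID.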
def load_vocab(target_model, current_model):
    checkpoint = target_model
    if target_model == current_model:
        gr.Info(f"Tokenizer already loaded: {checkpoint}")
    else:
        load_tokenizer(checkpoint)
        gr.Info(f"Tokenizer loaded: {checkpoint}")
    # Sort the vocabulary by token ID, then split off the first entry (token 0)
    # so it can be shown separately in the UI.
    vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
    unk = next(iter(vocab))
    vocab.pop(unk)
    vocab_sorted = "\n".join(vocab)
    vocab_size = len(vocab)
    gr.Info(f"Tokenizer vocab size: {vocab_size}")
    return checkpoint, vocab_size, unk, vocab_sorted
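
# Download the tokenizer for `checkpoint` from the Hub and bind it to a
# module-level global so the event handlers below can share one instance.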
def load_tokenizer(checkpoint):
    global tokenizer
    if len(checkpoint) > 0:
        try:
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        except Exception as error:
            gr.Warning("Unexpected error!")
            raise gr.Error(f"{error}")
    else:
        raise ValueError("Tokenizer cannot be empty!")
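
# Tokenize the input sequence and pair every token with its vocabulary ID,
# in the [[token, id], ...] shape the DataFrame component expects.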
def tokenize_er(checkpoint, sequence):
    try:
        load_tokenizer(checkpoint)
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        token_id_pair = []
        if len(tokens) == len(ids):
            for i in range(len(ids)):
                token_id_pair.append([tokens[i], ids[i]])
        return token_id_pair
    except NameError:
        gr.Warning("Select Tokenizer before sequencing.")
        return [[None, None]]
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
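
# Rebuild tokens and IDs from the (possibly hand-edited) DataFrame rows and
# decode each column separately, so edits to either side can be compared.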
def de_tokenize_er(checkpoint, pairs):
    try:
        load_tokenizer(checkpoint)
        tokens = []
        ids = []
        for row in pairs:
            tokens.append(row[0])
            try:
                ids.append(int(row[1]))
            except (TypeError, ValueError):
                # Fall back to ID 0 when a cell doesn't hold a valid integer.
                ids.append(0)
        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
        decoded_tokens = tokenizer.decode(tokens_ids)
        decoded_ids = tokenizer.decode(ids)
        return tokens_ids, decoded_tokens, decoded_ids
    except NameError:
        gr.Warning("Tokenize sequence before decoding.")
        return None, None, None
    except Exception as error:
        gr.Warning("Unexpected error!")
        raise gr.Error(f"{error}")
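
# UI: the tokenize/edit/decode workflow in a wide left column, and the loaded
# tokenizer's vocabulary data in a narrow right column.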
with gr.Blocks() as frontend:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("# 💎 Tokenizaminer\n### The Tokenizer Examiner, or the Tokeniza Miner... 🕵️🕳️\nThe purpose of this tool is to examine the vocabulary and tokens of a model's tokenizer and play with the results.\nNote how the Vocabulary ID lines up with the full Vocabulary index on the right ➡️\n\n⚠️ Loading the full vocabulary can take a few seconds and the browser might stutter.")
            with gr.Row():
                gr.Markdown("\n#### 1. Select Tokenizer\nSelect from the list or enter any model from 🤗 Hugging Face Models; it will only download the Tokenizer data! Image models won't work here.")
            with gr.Row():
                input_checkpoint = gr.Dropdown(label="Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, show_label=False, container=False)
                #btn_load_vocab = gr.Button(value="Load Vocabulary")
            with gr.Row():
                gr.Markdown("\n#### 2. Sequence & Tokenize")
            with gr.Row():
                input_sequence = gr.TextArea(label="Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True, show_label=False, container=False)
            with gr.Row():
                btn_tokenize = gr.Button(value="Tokenize!")
                btn_random_seq = gr.Button(value="Randomize!")
            with gr.Row():
                gr.Markdown("\n#### 3. Decode\nYou can select and edit each cell individually - then hit Decode!")
            with gr.Row():
                token_id_pair = gr.DataFrame(col_count=(2, "fixed"), headers=["Token", "Vocabulary ID"], value=[[None, 0]], type="array", datatype=["str", "number"], height=400, interactive=True)
            with gr.Row():
                btn_decode = gr.Button(value="Decode")
                btn_clear_pairs = gr.ClearButton(value="Clear Token/IDs", components=[token_id_pair])
            with gr.Row():
                with gr.Column():
                    output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
                    output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
                with gr.Column():
                    output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
        with gr.Column(scale=1):
            with gr.Group():
                gr.Markdown("### 🎲 Tokenizer Data")
                output_checkpoint = gr.Textbox(visible=False)
                output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                output_token_zero = gr.Textbox(label="Token 0", interactive=False)
                output_vocab = gr.Code(label="Vocabulary IDs")

    # Wire up events: changing the checkpoint (and the initial page load)
    # refreshes the vocabulary panel on the right.
    input_checkpoint.change(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)
    btn_tokenize.click(fn=tokenize_er, inputs=[input_checkpoint, input_sequence], outputs=[token_id_pair], queue=True)
    btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
    btn_decode.click(fn=de_tokenize_er, inputs=[input_checkpoint, token_id_pair], outputs=[output_decoded_token_ids, output_decoded_tokens, output_decoded_ids], queue=True)
    frontend.load(fn=load_vocab, inputs=[input_checkpoint, output_checkpoint], outputs=[output_checkpoint, output_vocab_count, output_token_zero, output_vocab], queue=True)

frontend.launch()