Commit a5212f9
Parent(s): d222a38
Initial commit of the app.py
app.py ADDED
@@ -0,0 +1,113 @@
from transformers import AutoTokenizer
import gradio as gr
import random

# Default checkpoint plus a few alternatives to populate the dropdown.
checkpoint = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
checkpoints = [
    checkpoint,
    "microsoft/phi-2",
    "openai/whisper-large-v3",
    "NousResearch/Nous-Hermes-2-Yi-34B",
    "bert-base-cased"
]

placeholder = "Type anything in this text box and hit Tokenize!"
sequences = [
    "The quick brown 🦊 fox jumps over the lazy 🐕 dog!",
    "How vexingly ⏩ quick daft 🦓 zebras jump?",
    "Pack my 📦 box with five dozen 🍷 liquor jugs.",
    "The five 🥊 boxing 🧙‍♂️ wizards jump quickly~",
    "While making deep ⛏️ excavations we found some quaint bronze 💍 jewelry!",
    "Whenever the 🦊 fox jumped, the 🐿️ squirrel gazed suspiciously...",
    "We promptly 🧑‍⚖️ judged antique ivory buckles for the next 🏆 prize."
]

def randomize_sequence():
    return random.choice(sequences)

# Passed as a callable so Gradio draws a fresh sequence on every page load.
sequence = randomize_sequence

def load_tokenizer(checkpoint):
    if "tokenizer" not in globals():
        global tokenizer
        tokenizer = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        # Sort the vocabulary by token ID and treat the first entry as the unknown token.
        vocab = dict(sorted(tokenizer.vocab.items(), key=lambda item: item[1]))
        unk = next(iter(vocab))
        vocab.pop(unk)
        vocab_sorted = "\n".join(vocab)
        vocab_size = len(vocab)
        gr.Info(f"Tokenizer loaded '{checkpoint}' with vocab size: {vocab_size}")
        #return checkpoint, vocab_size, vocab
        return vocab_size, unk, vocab_sorted
    except Exception as error:
        gr.Warning("An unexpected error occurred while loading the Tokenizer.")
        gr.Warning(f"{error}")
        return None, None, None

def tokenize_er(sequence):
    try:
        # Split the sequence into tokens and pair each token with its ID for the DataFrame.
        tokens = tokenizer.tokenize(sequence)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        token_id_pair = []
        if len(tokens) == len(ids):
            for i in range(len(ids)):
                token_id_pair.append([tokens[i], ids[i]])
        return token_id_pair
    except NameError:
        gr.Warning("Load Tokenizer before sequencing.")
        return [[None, None]]

def de_tokenize_er(pairs):
    try:
        tokens = []
        ids = []
        for row in pairs:
            tokens.append(row[0])
            try:
                ids.append(int(row[1]))
            except (TypeError, ValueError):
                ids.append(0)
        # Re-encode the (possibly edited) tokens and decode both columns for comparison.
        tokens_ids = tokenizer.convert_tokens_to_ids(tokens)
        decoded_tokens = tokenizer.decode(tokens_ids)
        decoded_ids = tokenizer.decode(ids)
        return tokens_ids, decoded_tokens, decoded_ids
    except NameError:
        gr.Warning("Tokenize sequence before decoding.")
        return None, None, None

with gr.Blocks() as frontend:
    with gr.Row():
        with gr.Column(scale=3):
            gr.Markdown("# 🔍 Tokenizaminer\n\n### The Tokenizer Examiner... 🕵️🕳️\n\nThe purpose of this tool is to examine the vocabulary and tokens of a model's tokenizer and play with the results.\n\n## Instructions\n\n1. Load a tokenizer\n2. Type and Tokenize a sequence\n3. Manipulate it to see what happens!")
            with gr.Group():
                input_checkpoint = gr.Dropdown(label="1. Tokenizer", choices=checkpoints, value=checkpoint, allow_custom_value=True, info="Select from the list or enter any model from 🤗 Hugging Face Models, it will only download the Tokenizer data! Image models won't work here.")
                btn_load_tokenizer = gr.Button(value="Load Tokenizer")
            with gr.Row():
                input_sequence = gr.TextArea(label="2. Sequence", value=sequence, placeholder=placeholder, lines=3, interactive=True)
            with gr.Row():
                btn_tokenize = gr.Button(value="Tokenize!")
                btn_random_seq = gr.Button(value="Randomize!")
            with gr.Row():
                token_id_pair = gr.DataFrame(label="3. Decode", col_count=(2, "fixed"), headers=["Token", "ID"], type="array", datatype=["str", "number"], height=400, interactive=True)
            with gr.Row():
                btn_decode = gr.Button(value="Decode")
            with gr.Row():
                with gr.Column():
                    output_decoded_token_ids = gr.TextArea(label="Re-encoded Tokens", interactive=False)
                    output_decoded_tokens = gr.TextArea(label="Decoded Re-encoded Tokens", interactive=False)
                with gr.Column():
                    output_decoded_ids = gr.TextArea(label="Decoded IDs", interactive=False)
        with gr.Column(scale=1):
            with gr.Group():
                output_vocab_count = gr.Number(label="Vocab Size", interactive=False)
                output_unknown_token = gr.Textbox(label="Unknown Token", interactive=False)
            output_vocab = gr.Code(label="Vocabulary")

    btn_load_tokenizer.click(fn=load_tokenizer, inputs=[input_checkpoint], outputs=[output_vocab_count, output_unknown_token, output_vocab])
    btn_tokenize.click(fn=tokenize_er, inputs=[input_sequence], outputs=[token_id_pair])
    btn_random_seq.click(fn=randomize_sequence, inputs=[], outputs=[input_sequence])
    btn_decode.click(fn=de_tokenize_er, inputs=[token_id_pair], outputs=[output_decoded_token_ids, output_decoded_tokens, output_decoded_ids])

frontend.launch()
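For reference, the tokenize/decode round trip that the app's Tokenize! and Decode buttons perform can be reproduced outside Gradio. A minimal sketch, assuming the transformers package is installed and the TinyLlama checkpoint above is reachable:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
sequence = "The quick brown fox jumps over the lazy dog!"

# Same steps as tokenize_er: split the sequence into tokens, then map tokens to IDs.
tokens = tokenizer.tokenize(sequence)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(list(zip(tokens, ids)))

# Same steps as de_tokenize_er: decode the IDs back into text.
print(tokenizer.decode(ids))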