import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer
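# Requires the `gradio`, `datasets`, and `transformers` packages
# (e.g. `pip install gradio datasets transformers`).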

def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
    """Count the tokens in every column of a Hugging Face dataset split."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    dataset = load_dataset(dataset, split=split)
    total = 0
    for column in dataset.column_names:
        for value in dataset[column]:
            # Only string columns are tokenizable; skip numeric/other fields.
            if isinstance(value, str):
                total += len(tokenizer.tokenize(value))
    return total
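# Quick sanity check (hypothetical dataset name; assumes Hub access):
#   ReturnTokens("imdb", "openai-community/gpt2", "test")
# would return the total GPT-2 token count over all string columns of that split.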

with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")

    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(label="Tokenizer", elem_id="tokenizer", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split = gr.Textbox(label="Split (default: train)", elem_id="split", placeholder="train", value="train")
        tokens = gr.Label(label="Tokens", elem_id="tokens")

    # Submitting any of the three fields re-runs the token count.
    gr.on(
        triggers=[prompt.submit, tokenizer.submit, split.submit],
        fn=ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
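# Because the event is registered with api_name="run", the counter is also
# callable programmatically, e.g.:
#   from gradio_client import Client
#   Client("http://127.0.0.1:7860").predict(
#       "imdb", "openai-community/gpt2", "train", api_name="/run"
#   )
# (gradio_client usage sketched here; the URL assumes the default local server.)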