File size: 1,620 Bytes
3e8ed31
 
89002be
3e8ed31
42d7cb6
89002be
c221aa8
0a88a11
2c3f7c0
c221aa8
2c3f7c0
 
c221aa8
2c3f7c0
 
 
 
 
 
 
 
 
 
 
c221aa8
2c3f7c0
 
 
89002be
3e8ed31
 
89002be
3e8ed31
 
 
 
dd8b6dd
3e8ed31
0a88a11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42d7cb6
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import token
import tokenize
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer

def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
    """Count the total number of tokens across all text columns of a dataset split.

    Args:
        dataset_name: Hugging Face Hub id of the dataset to load.
        tokenizer_name: Hub id of the tokenizer to use (default: GPT-2).
        split: Name of the split to count (default: "train").

    Returns:
        int: Total token count summed over every string-valued column
        of the requested split.
    """
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load dataset
    dataset = load_dataset(dataset_name, split=split)

    # Nothing to count in an empty split (also avoids indexing dataset[0]).
    if len(dataset) == 0:
        return 0

    # Function to count tokens in one column (a list of strings)
    def count_tokens_in_column(values):
        # BUG FIX: batch_encode_plus returns a BatchEncoding *mapping*;
        # iterating over it yields its keys ("input_ids", "attention_mask"),
        # so the old `len(i)` summed the lengths of those key strings —
        # a constant — instead of token counts. Sum the lengths of the
        # actual encoded id sequences instead.
        tokenized = tokenizer.batch_encode_plus(values)
        return sum(len(ids) for ids in tokenized["input_ids"])

    tokens_ = 0
    for field in dataset[0].keys():
        column = dataset[field]
        # Only string columns can be tokenized; skip numeric/label columns,
        # which previously made batch_encode_plus raise.
        if column and isinstance(column[0], str):
            tokens_ += count_tokens_in_column(column)

    return tokens_

# Build the UI: three text inputs (dataset id, tokenizer id, split) and a
# label that shows the total token count computed by ReturnTokens.
with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")

    with gr.Row():
        dataset_box = gr.Textbox(label="Dataset", elem_id="dataset", info="", placeholder="")
        tokenizer_box = gr.Textbox(label="Tokenizer", elem_id="tokenizer", info="", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split_box = gr.Textbox(label="Split (default: train)", elem_id="split", info="", placeholder="train", value="train")
        token_count = gr.Label(label="Tokens", elem_id="tokens")

    # Recompute whenever the user submits any of the three input boxes.
    gr.on(
        triggers=[dataset_box.submit, tokenizer_box.submit, split_box.submit],
        fn=ReturnTokens,
        inputs=[dataset_box, tokenizer_box, split_box],
        outputs=[token_count],
        api_name="run",
    )

app.launch()