"""Gradio app that counts the total number of tokens in a Hugging Face dataset."""

import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer


def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
    """Sum the token counts of every column in the requested dataset split."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    dataset = load_dataset(dataset_name, split=split)

    def count_tokens(examples):
        # Cast to str so non-text columns (e.g. integer labels) don't break tokenize().
        return sum(len(tokenizer.tokenize(str(example))) for example in examples)

    total_tokens = 0
    for field in dataset.column_names:
        total_tokens += count_tokens(dataset[field])
    return total_tokens


with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(
            label="Tokenizer",
            elem_id="tokenizer",
            placeholder="openai-community/gpt2",
            value="openai-community/gpt2",
        )
        split = gr.Textbox(
            label="Split (default: train)",
            elem_id="split",
            placeholder="train",
            value="train",
        )
    tokens = gr.Label(label="Tokens", elem_id="tokens")

    # One gr.on() registration serves all three textboxes: submitting any of
    # them recomputes the count. Chaining a second prompt.submit() event on top
    # of this would run ReturnTokens twice per dataset submit.
    gr.on(
        triggers=[prompt.submit, tokenizer.submit, split.submit],
        fn=ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
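
# Usage sketch (an assumption, not part of the app itself): because the handler
# is registered with api_name="run", a running instance can also be queried
# programmatically with gradio_client. The URL and dataset name below are
# illustrative placeholders.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860")  # local default; a Space ID also works
#   result = client.predict(
#       "stanfordnlp/imdb",       # dataset_name
#       "openai-community/gpt2",  # tokenizer_name
#       "train",                  # split
#       api_name="/run",
#   )
#   print(result)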
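
# Performance note (a sketch under assumptions, not the app's method): the
# per-example loop in ReturnTokens is slow on large datasets. Tokenizing each
# column as a batch with add_special_tokens=False yields the same counts as
# tokenize() and is usually much faster; fast_token_count is a hypothetical
# helper name.
#
#   def fast_token_count(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
#       tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
#       dataset = load_dataset(dataset_name, split=split)
#       total = 0
#       for field in dataset.column_names:
#           batch = tokenizer([str(x) for x in dataset[field]], add_special_tokens=False)
#           total += sum(len(ids) for ids in batch["input_ids"])
#       return total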