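"""Gradio app: count how many tokens a Hugging Face dataset split contains
for a given tokenizer (defaults to openai-community/gpt2)."""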
import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer

def ReturnTokens(dataset_name, tokenizer_name="openai-community/gpt2", split="train"):
    # Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    # Load dataset
    dataset = load_dataset(dataset_name, split=split)

    # Count tokens in one column: tokenize all of its values as a batch and
    # sum the lengths of the resulting input_ids sequences.
    def count_tokens_in_column(values):
        encoded = tokenizer(values)["input_ids"]
        return sum(len(ids) for ids in encoded)

    total_tokens = 0
    for field in dataset[0].keys():
        # Only string columns are tokenizable; skip labels, ids, etc.
        if isinstance(dataset[0][field], str):
            total_tokens += count_tokens_in_column(dataset[field])
    return total_tokens
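
# A hedged alternative (not wired into the UI below): for very large datasets,
# streaming avoids downloading the full split up front. The function name and
# the max_examples cap are assumptions for illustration; the result is an
# estimate over the first max_examples rows only.
def estimate_tokens_streaming(dataset_name, tokenizer_name="openai-community/gpt2",
                              split="train", max_examples=1000):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # streaming=True returns an IterableDataset that yields examples lazily
    stream = load_dataset(dataset_name, split=split, streaming=True)
    total_tokens = 0
    for i, example in enumerate(stream):
        if i >= max_examples:
            break
        for value in example.values():
            if isinstance(value, str):
                total_tokens += len(tokenizer(value)["input_ids"])
    return total_tokens
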
with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        dataset_name = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer_name = gr.Textbox(label="Tokenizer", elem_id="tokenizer", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split = gr.Textbox(label="Split (default: train)", elem_id="split", placeholder="train", value="train")
    tokens = gr.Label(label="Tokens", elem_id="tokens")
    gr.on(
        triggers=[
            dataset_name.submit,
            tokenizer_name.submit,
            split.submit,
        ],
        fn=ReturnTokens,
        inputs=[
            dataset_name,
            tokenizer_name,
            split,
        ],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
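
# Because the listener is registered with api_name="run", the count can also
# be fetched programmatically with gradio_client (a sketch; assumes the app is
# running on the default local port and that gradio_client is installed):
#
#   from gradio_client import Client
#   client = Client("http://127.0.0.1:7860")
#   print(client.predict("imdb", "openai-community/gpt2", "train", api_name="/run"))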