import gradio as gr
from datasets import load_dataset
from tokenizers import Tokenizer
def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
    # Load the tokenizer and the requested split of the dataset.
    tok = Tokenizer.from_pretrained(tokenizer)
    data = load_dataset(dataset, split=split)
    tokens_ = 0

    def CountTokens(example):
        # Tokenize every field of the row and accumulate the token count.
        nonlocal tokens_
        for value in example.values():
            tokens_ += len(tok.encode(str(value)).ids)
        return example

    data.map(CountTokens)
    return tokens_
with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(label="Tokenizer", elem_id="tokenizer", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split = gr.Textbox(label="Split (default: train)", elem_id="split", placeholder="train", value="train")
    tokens = gr.Label(label="Tokens", elem_id="tokens")
    # Run the counter when the user submits a dataset name.
    prompt.submit(
        ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
    )

app.launch()
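
# Optional smoke test (an assumption, not part of the original Space): the
# counter can be called directly instead of through the UI. "rotten_tomatoes"
# is only an illustrative public dataset; comment out app.launch() above
# before uncommenting the line below.
# print(ReturnTokens("rotten_tomatoes", tokenizer="openai-community/gpt2", split="train"))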