import gradio as gr
from datasets import load_dataset
from transformers import AutoTokenizer


def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
    """Count the tokens produced by `tokenizer` across every text column of `dataset`."""
    tokenizer = AutoTokenizer.from_pretrained(tokenizer)
    dataset = load_dataset(dataset, split=split)
    total = 0
    for column in dataset.column_names:
        for value in dataset[column]:
            # Only tokenize string columns; skip non-string ones such as integer labels,
            # which would otherwise crash tokenizer.tokenize().
            if isinstance(value, str):
                total += len(tokenizer.tokenize(value))
    return total
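
# A quick local sanity check (assumes network access to the Hugging Face Hub;
# "stanfordnlp/imdb" is just an illustrative dataset id):
#   print(ReturnTokens("stanfordnlp/imdb", split="test"))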
with gr.Blocks(title="Dataset token counter") as app:
    gr.Markdown("# Token Counter")
    with gr.Row():
        prompt = gr.Textbox(label="Dataset", elem_id="dataset")
        tokenizer = gr.Textbox(label="Tokenizer", elem_id="tokenizer", placeholder="openai-community/gpt2", value="openai-community/gpt2")
        split = gr.Textbox(label="Split (default: train)", elem_id="split", placeholder="train", value="train")
    tokens = gr.Label(label="Tokens", elem_id="tokens")
    # Registering the handler once via gr.on recomputes the count whenever any
    # of the three fields is submitted, without firing it twice per submit.
    gr.on(
        triggers=[prompt.submit, tokenizer.submit, split.submit],
        fn=ReturnTokens,
        inputs=[prompt, tokenizer, split],
        outputs=[tokens],
        api_name="run",
    )

app.launch()
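
# Because the event is exposed with api_name="run", the counter can also be
# called remotely with gradio_client. A sketch, assuming the Space id
# "<owner>/<space-name>" (placeholder) and the illustrative dataset above:
#   from gradio_client import Client
#   client = Client("<owner>/<space-name>")
#   total = client.predict("stanfordnlp/imdb", "openai-community/gpt2", "train", api_name="/run")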