Spaces:
Sleeping
Sleeping
Boubou78000
committed on
Commit
·
42d7cb6
1
Parent(s):
dd8b6dd
Created app
Browse files
app.py
CHANGED
@@ -2,18 +2,21 @@ import token
|
|
2 |
import tokenize
|
3 |
import gradio as gr
|
4 |
from datasets import load_dataset
|
5 |
-
from
|
6 |
|
7 |
def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
|
8 |
global tokens_
|
9 |
-
tokenizer=
|
10 |
-
dataset=load_dataset(dataset)
|
11 |
tokens_=0
|
12 |
def CountTokens(Example):
|
13 |
global tokens_
|
14 |
-
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
17 |
return tokens_
|
18 |
|
19 |
with gr.Blocks(title="Dataset token counter") as app:
|
@@ -30,4 +33,20 @@ with gr.Blocks(title="Dataset token counter") as app:
|
|
30 |
outputs=[tokens]
|
31 |
)
|
32 |
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
import tokenize
|
3 |
import gradio as gr
|
4 |
from datasets import load_dataset
|
5 |
+
from transformers import AutoTokenizer
|
6 |
|
7 |
def ReturnTokens(dataset, tokenizer="openai-community/gpt2", split="train"):
    """Count the total number of tokens across every column of a dataset split.

    Args:
        dataset: Hub id (or local path) of the dataset passed to `load_dataset`.
        tokenizer: Hub id of the tokenizer to load (default: GPT-2).
        split: Which split of the dataset to count (default: "train").

    Returns:
        int: the total number of tokens over all cells of all columns.
    """
    tok = AutoTokenizer.from_pretrained(tokenizer)
    ds = load_dataset(dataset, split=split)
    # Accumulate locally instead of through a module-level global so
    # concurrent Gradio requests cannot corrupt each other's counts.
    total = 0
    # `column_names` works even for an empty split, unlike `ds[0].keys()`.
    for column in ds.column_names:
        for value in ds[column]:
            # str() guards against non-string columns (ints, floats, ...),
            # which `tokenizer.tokenize` would otherwise reject.
            total += len(tok.tokenize(str(value)))
    return total
21 |
|
22 |
with gr.Blocks(title="Dataset token counter") as app:
|
|
|
33 |
outputs=[tokens]
|
34 |
)
|
35 |
|
36 |
+
gr.on(
|
37 |
+
triggers=[
|
38 |
+
prompt.submit,
|
39 |
+
tokenizer.submit,
|
40 |
+
split.submit,
|
41 |
+
],
|
42 |
+
fn=ReturnTokens,
|
43 |
+
inputs=[
|
44 |
+
prompt,
|
45 |
+
tokenizer,
|
46 |
+
split
|
47 |
+
],
|
48 |
+
outputs=[tokens],
|
49 |
+
api_name="run",
|
50 |
+
)
|
51 |
+
|
52 |
+
app.launch()
|