Frinkleko committed on
Commit
6dbdb73
·
verified ·
1 Parent(s): d45033c

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -0
app.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import RobertaTokenizer
3
+ import pandas as pd
4
+ import json
5
+
6
+
7
# Tokenizer for the "roberta-base" checkpoint, loaded once at import time and
# shared by every call to process_text.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def process_text(text, include_special_tokens=False, show_attention_mask=False):
    """Tokenize *text* and return a table, summary statistics and JSON.

    Args:
        text: Raw input string. ``None`` or ``""`` yields empty results
            instead of raising.
        include_special_tokens: When true, keep the tokens the tokenizer adds
            around the text itself; when false, only tokens produced from
            *text* are reported.
        show_attention_mask: When true, add an "Attention Mask" column to the
            table.

    Returns:
        A ``(df, stats, json_output)`` tuple:

        * ``df`` - ``pandas.DataFrame`` with one row per reported token
          ("Token", "ID" and optionally "Attention Mask" columns).
        * ``stats`` - multi-line human-readable summary string.
        * ``json_output`` - JSON string with ``input_ids`` and ``tokens``
          lists of equal length.
    """
    text = text or ""

    # One encoding is the single source of truth for ids, tokens and masks, so
    # the three sequences are always the same length and stay aligned.  No
    # truncation is applied: every token of the input is reported.
    encoding = tokenizer(text, return_special_tokens_mask=True)
    token_ids = list(encoding["input_ids"])
    attention_mask = list(encoding["attention_mask"])
    special_flags = list(encoding["special_tokens_mask"])
    tokens = tokenizer.convert_ids_to_tokens(token_ids)

    if not include_special_tokens:
        # Drop the positions the tokenizer flagged as special, from all three
        # sequences at once so that rows keep matching.
        kept = [i for i, flag in enumerate(special_flags) if not flag]
        tokens = [tokens[i] for i in kept]
        token_ids = [token_ids[i] for i in kept]
        attention_mask = [attention_mask[i] for i in kept]

    columns = ["Token", "ID"]
    if show_attention_mask:
        columns.append("Attention Mask")

    rows = []
    for token, token_id, mask_value in zip(tokens, token_ids, attention_mask):
        row = {"Token": token, "ID": token_id}
        if show_attention_mask:
            row["Attention Mask"] = mask_value
        rows.append(row)

    # Passing the columns explicitly keeps the headers even when there are no rows.
    df = pd.DataFrame(rows, columns=columns)

    # Empty input has no characters to divide by; report a ratio of zero.
    ratio = len(tokens) / len(text) if text else 0.0

    stats = (
        "\n"
        f"Number of tokens: {len(tokens)}\n"
        f"Input text length: {len(text)}\n"
        f"Tokens/character ratio: {ratio:.2f}\n"
        f"Vocabulary size: {tokenizer.vocab_size}\n"
    )

    json_output = json.dumps(
        {
            "input_ids": token_ids,
            "tokens": tokens,
        },
        indent=2,
    )

    return df, stats, json_output
51
+
52
+
53
# Input widgets, listed in the positional order of process_text's parameters:
# the text to tokenize, then the two boolean options.
_input_components = [
    gr.Textbox(
        label="Input Text",
        placeholder="Enter text to tokenize...",
        lines=5,
    ),
    gr.Checkbox(value=False, label="Include Special Tokens"),
    gr.Checkbox(value=False, label="Show Attention Mask"),
]

# Output widgets, one per element of the tuple returned by process_text:
# the token table, the statistics text and the JSON dump.
_output_components = [
    gr.Dataframe(
        label="Tokenization Results",
        headers=["Token", "ID", "Attention Mask"],
    ),
    gr.Textbox(lines=4, label="Statistics"),
    gr.JSON(label="JSON Output"),
]

# Gradio UI wiring the widgets above to process_text.
iface = gr.Interface(
    title="RoBERTa Tokenizer Playground",
    description="\nAn interactive demonstration of the RoBERTa tokenizer.\n",
    fn=process_text,
    inputs=_input_components,
    outputs=_output_components,
    theme="default",
)


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(share=True)