Frinkleko committed
Commit 73ff676 · verified · 1 Parent(s): 2060e45

add more base models

Files changed (1): app.py +57 -15
app.py CHANGED
@@ -1,32 +1,59 @@
 import gradio as gr
-from transformers import RobertaTokenizer
+from transformers import AutoTokenizer
 import pandas as pd
 import json
 
-
-tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
-
-
-def process_text(text, include_special_tokens=False, show_attention_mask=False):
+def process_text(model_name, text, include_special_tokens=False, show_attention_mask=False):
+    """
+    Processes text using a specified Hugging Face tokenizer model.
+    """
+    try:
+        # Dynamically load the tokenizer based on the selected model name
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+    except Exception as e:
+        return (
+            pd.DataFrame([{"Error": f"Could not load tokenizer for '{model_name}': {e}. Please ensure the model name is correct and accessible (e.g., through Hugging Face Hub or a local path)."}]),
+            "",
+            "",
+        )
 
     encoding = tokenizer(text, return_tensors="np", padding=True, truncation=True)
 
+    # Use tokenizer.tokenize and tokenizer.encode for consistency and general compatibility
     tokens = tokenizer.tokenize(text)
     token_ids = tokenizer.encode(text)
 
+    # Adjust special token handling based on the flag
     if not include_special_tokens:
-        tokens = tokens
-        token_ids = token_ids[1:-1]
+        # Attempt to remove special tokens by decoding and then encoding without special tokens.
+        # This approach aims for a general solution but might behave differently for
+        # tokenizers with complex special token handling or if tokens are meant to be inseparable.
+        try:
+            decoded_text = tokenizer.decode(token_ids, skip_special_tokens=True)
+            token_ids = tokenizer.encode(decoded_text, add_special_tokens=False)
+            tokens = tokenizer.tokenize(decoded_text, add_special_tokens=False)
+        except Exception as e:
+            # Fallback if specific handling fails. It's better to process without removing
+            # special tokens if an error occurs rather than failing the whole process.
+            print(f"Warning: Could not remove special tokens for {model_name}. Error: {e}")
+            # Keep original tokens and IDs which include special tokens
+            tokens = tokenizer.tokenize(text)
+            token_ids = tokenizer.encode(text)
 
     token_info = []
-    for token, token_id in zip(tokens, token_ids):
+    # Ensure tokens and token_ids have matching lengths for zipping
+    min_len = min(len(tokens), len(token_ids))
+    for i in range(min_len):
+        token = tokens[i]
+        token_id = token_ids[i]
         info = {
             "Token": token,
             "ID": token_id,
         }
 
-        if show_attention_mask:
-            info["Attention Mask"] = encoding["attention_mask"][0][len(token_info)]
+        # Check if attention_mask is available and has the correct dimension before accessing
+        if show_attention_mask and encoding["attention_mask"].shape[1] > i:
+            info["Attention Mask"] = encoding["attention_mask"][0][i]
 
         token_info.append(info)
 
@@ -45,14 +72,29 @@ def process_text(text, include_special_tokens=False, show_attention_mask=False):
             "tokens": tokens,
         },
         indent=2,
+        ensure_ascii=False  # Ensure non-ASCII characters are not escaped in JSON
    )
 
    return df, stats, json_output
 
+# Define available models using your specified paths
+model_choices = [
+    "roberta-base",
+    "klue/roberta-large",
+    "distilbert/distilbert-base-uncased",
+    "BAAI/bge-m3-retromae",
+    "DTAI-KULeuven/robbert-2023-dutch-base",
+    "DTAI-KULeuven/robbert-2023-dutch-large",
+]
 
 iface = gr.Interface(
     fn=process_text,
     inputs=[
+        gr.Dropdown(
+            choices=model_choices,
+            value="roberta-base",
+            label="Select Model",
+        ),
         gr.Textbox(
             lines=5, placeholder="Enter text to tokenize...", label="Input Text"
         ),
@@ -66,13 +108,13 @@ iface = gr.Interface(
         gr.Textbox(label="Statistics", lines=4),
         gr.JSON(label="JSON Output"),
     ],
-    title="RoBERTa Tokenizer Playground",
+    title="Hugging Face Tokenizer Playground",
     description="""
-    An interactive demonstration of the RoBERTa tokenizer.
+    An interactive demonstration of various Hugging Face tokenizers.
+    Select a model from the dropdown to see how it tokenizes your input text.
     """,
     theme="default",
 )
 
-
 if __name__ == "__main__":
-    iface.launch(share=True)
+    iface.launch(share=True)
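
For quick reference, below is a minimal standalone sketch of the special-token handling this commit introduces (decode with skip_special_tokens=True, then re-encode with add_special_tokens=False), run against two of the models from the new dropdown. The sample sentence and the printed comparison are illustrative only and are not part of the Space's code.

# Minimal sketch — assumes the `transformers` package is installed and the Hub models are reachable.
from transformers import AutoTokenizer

text = "Tokenizers split text differently."
for name in ["roberta-base", "distilbert/distilbert-base-uncased"]:
    tok = AutoTokenizer.from_pretrained(name)
    ids = tok.encode(text)  # includes special tokens (e.g. <s>/</s> or [CLS]/[SEP])
    # Same strategy as process_text: strip specials by decoding, then re-encoding
    plain = tok.decode(ids, skip_special_tokens=True)
    ids_no_special = tok.encode(plain, add_special_tokens=False)
    print(f"{name}: {len(ids)} ids with specials -> {len(ids_no_special)} without")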