hotchpotch committed on
Commit
02e5eda
1 Parent(s): bf8e518

Update TokenViz: AutoTokenizer Visualization Tool

Browse files
Files changed (2) hide show
  1. README.md +4 -4
  2. app.py +14 -11
README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: Tokenizers
3
- emoji: 😻
4
  colorFrom: green
5
- colorTo: green
6
  sdk: streamlit
7
  sdk_version: 1.39.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: AutoTokenizerでのtoken化の結果を取得
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: "TokenViz: AutoTokenizer Visualization Tool"
3
+ emoji: 🔍
4
  colorFrom: green
5
+ colorTo: indigo
6
  sdk: streamlit
7
  sdk_version: 1.39.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: Visualize the results of AutoTokenizer
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -17,6 +17,7 @@ DEFAULT_TOKENIZER_NAME = os.environ.get(
17
  DEFAULT_TEXT = """
18
  hello world!
19
  こんにちは、世界!
 
20
  """.strip()
21
 
22
  DEFAULT_COLOR = "gray"
@@ -26,13 +27,13 @@ COLORS_CYCLE = [
26
  ]
27
 
28
 
29
- def color_cycle_generater():
30
- def _color_cycle_generater():
31
  while True:
32
  for color in COLORS_CYCLE:
33
  yield color
34
 
35
- return _color_cycle_generater()
36
 
37
 
38
  @st.cache_resource
@@ -43,18 +44,20 @@ def get_tokenizer(tokenizer_name: str = DEFAULT_TOKENIZER_NAME):
43
 
44
  def main():
45
  st.set_page_config(
46
- page_title="tokenizer visualizer",
47
  layout="centered",
48
  initial_sidebar_state="auto",
49
  )
50
 
51
  st.title("tokenizer visualizer")
52
- st.text_input("tokenizer name", key="tokenizer_name", value=DEFAULT_TOKENIZER_NAME)
 
 
53
  if st.session_state.tokenizer_name:
54
  tokenizer = get_tokenizer(st.session_state.tokenizer_name)
55
  st.text_input("subword prefix", key="subword_prefix", value="##")
56
  st.text_area("text", key="text", height=200, value=DEFAULT_TEXT)
57
- # submit
58
  if st.button("tokenize"):
59
  text = st.session_state.text.strip()
60
  subword_prefix = st.session_state.subword_prefix.strip()
@@ -72,13 +75,13 @@ def main():
72
  st.markdown(f"total tokens: **{total_tokens}**")
73
  tab_main, tab_token_table = st.tabs(["tokens", "table"])
74
 
75
- color_gen = color_cycle_generater()
76
  with tab_main:
77
  current_subword_color = next(color_gen)
78
  token_html = ""
79
  for idx, (token_id, token) in enumerate(zip(token_ids, tokens)):
80
  if len(subword_prefix) == 0:
81
- target_border = f"1px solid {DEFAULT_COLOR}"
82
  else:
83
  current_token_is_subword = token.startswith(subword_prefix)
84
  next_token_is_subword = idx + 1 < total_tokens and tokens[
@@ -89,12 +92,12 @@ def main():
89
  current_subword_color = next(color_gen)
90
 
91
  if current_token_is_subword or next_token_is_subword:
92
- target_border = f"1px solid {current_subword_color}"
93
  else:
94
- target_border = f"1px solid {DEFAULT_COLOR}"
95
 
96
  html_escaped_token = token.replace("<", "&lt;").replace(">", "&gt;")
97
- token_html += f'<span title="{str(token_id)}" style="border: {target_border}; border-radius: 3px; padding: 2px; margin: 2px;">{html_escaped_token}</span>'
98
  st.html(
99
  f"<p style='line-height:2em;'>{token_html}</p>",
100
  )
 
17
  DEFAULT_TEXT = """
18
  hello world!
19
  こんにちは、世界!
20
+ 你好,世界
21
  """.strip()
22
 
23
  DEFAULT_COLOR = "gray"
 
27
  ]
28
 
29
 
30
+ def color_cycle_generator():
31
+ def _color_cycle_generator():
32
  while True:
33
  for color in COLORS_CYCLE:
34
  yield color
35
 
36
+ return _color_cycle_generator()
37
 
38
 
39
  @st.cache_resource
 
44
 
45
  def main():
46
  st.set_page_config(
47
+ page_title="TokenViz: AutoTokenizer Visualization Tool",
48
  layout="centered",
49
  initial_sidebar_state="auto",
50
  )
51
 
52
  st.title("tokenizer visualizer")
53
+ st.text_input(
54
+ "AutoTokenizer model name", key="tokenizer_name", value=DEFAULT_TOKENIZER_NAME
55
+ )
56
  if st.session_state.tokenizer_name:
57
  tokenizer = get_tokenizer(st.session_state.tokenizer_name)
58
  st.text_input("subword prefix", key="subword_prefix", value="##")
59
  st.text_area("text", key="text", height=200, value=DEFAULT_TEXT)
60
+ # Submit
61
  if st.button("tokenize"):
62
  text = st.session_state.text.strip()
63
  subword_prefix = st.session_state.subword_prefix.strip()
 
75
  st.markdown(f"total tokens: **{total_tokens}**")
76
  tab_main, tab_token_table = st.tabs(["tokens", "table"])
77
 
78
+ color_gen = color_cycle_generator()
79
  with tab_main:
80
  current_subword_color = next(color_gen)
81
  token_html = ""
82
  for idx, (token_id, token) in enumerate(zip(token_ids, tokens)):
83
  if len(subword_prefix) == 0:
84
+ token_border = f"1px solid {DEFAULT_COLOR}"
85
  else:
86
  current_token_is_subword = token.startswith(subword_prefix)
87
  next_token_is_subword = idx + 1 < total_tokens and tokens[
 
92
  current_subword_color = next(color_gen)
93
 
94
  if current_token_is_subword or next_token_is_subword:
95
+ token_border = f"1px solid {current_subword_color}"
96
  else:
97
+ token_border = f"1px solid {DEFAULT_COLOR}"
98
 
99
  html_escaped_token = token.replace("<", "&lt;").replace(">", "&gt;")
100
+ token_html += f'<span title="{str(token_id)}" style="border: {token_border}; border-radius: 3px; padding: 2px; margin: 2px;">{html_escaped_token}</span>'
101
  st.html(
102
  f"<p style='line-height:2em;'>{token_html}</p>",
103
  )