nragrawal commited on
Commit
f4bdbee
·
1 Parent(s): 93688f9

Add ability to color-code tokens.

Browse files
Files changed (1) hide show
  1. app.py +66 -5
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import json
 
3
  from read_files import Tokenizer # Make sure to include this file
4
 
5
  def load_tokenizer(path):
@@ -9,19 +10,74 @@ def load_tokenizer(path):
9
  merges = {tuple(map(int, k.split(','))): v for k, v in serialized_merges.items()}
10
  return Tokenizer(merges)
11
 
 
 
 
 
 
 
 
 
 
 
 
12
  # Load tokenizer
13
  tokenizer = load_tokenizer('tokenizer.json')
14
 
15
  def encode_text(text):
16
  """Encode text to tokens"""
 
17
  encoded = tokenizer.encode(text)
18
  decoded = tokenizer.decode(encoded)
19
- return str(encoded), len(encoded), decoded, text == decoded
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  def decode_tokens(token_string):
22
  """Decode token sequence back to text"""
23
  try:
24
- # Convert string representation of tokens to list of integers
25
  tokens = [int(t.strip()) for t in token_string.replace('[', '').replace(']', '').split(',')]
26
  decoded = tokenizer.decode(tokens)
27
  return decoded, len(tokens)
@@ -43,9 +99,14 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
43
  with gr.Row():
44
  token_ids = gr.Textbox(label="Token IDs")
45
  token_count = gr.Number(label="Token Count")
 
 
46
  decoded_text = gr.Textbox(label="Decoded Text")
47
  roundtrip_success = gr.Checkbox(label="Successful Round-trip")
48
 
 
 
 
49
  # Add example inputs for encoding
50
  gr.Examples(
51
  examples=[
@@ -73,7 +134,7 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
73
  # Add example inputs for decoding
74
  gr.Examples(
75
  examples=[
76
- ["[256, 257, 258, 259]"], # Add some actual token sequences here
77
  ["[260, 261, 262, 263]"],
78
  ],
79
  inputs=input_tokens
@@ -83,7 +144,7 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
83
  encode_btn.click(
84
  fn=encode_text,
85
  inputs=input_text,
86
- outputs=[token_ids, token_count, decoded_text, roundtrip_success]
87
  )
88
 
89
  decode_btn.click(
@@ -94,4 +155,4 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
94
 
95
  # Launch the app
96
  if __name__ == "__main__":
97
- iface.launch()
 
1
  import gradio as gr
2
  import json
3
+ import random
4
  from read_files import Tokenizer # Make sure to include this file
5
 
6
  def load_tokenizer(path):
 
10
  merges = {tuple(map(int, k.split(','))): v for k, v in serialized_merges.items()}
11
  return Tokenizer(merges)
12
 
13
def generate_color():
    """Return a random pastel color as a CSS ``rgb(r, g, b)`` string.

    Pastels come from sampling HSV with low saturation (0.3–0.5) and
    high value (0.9–1.0), then converting to RGB. Draws from the
    module-level ``random`` state, so output is reproducible after
    ``random.seed(...)``.
    """
    import colorsys  # hoisted to the top of the function (was mid-body)

    hue = random.random()                     # any hue on the wheel
    saturation = 0.3 + random.random() * 0.2  # low saturation -> washed-out tone
    value = 0.9 + random.random() * 0.1       # high value -> bright tone

    r, g, b = colorsys.hsv_to_rgb(hue, saturation, value)
    return f"rgb({int(r * 255)}, {int(g * 255)}, {int(b * 255)})"
23
+
24
  # Load tokenizer
25
  tokenizer = load_tokenizer('tokenizer.json')
26
 
27
def encode_text(text):
    """Encode *text* with the BPE tokenizer and build a color-coded preview.

    Returns a 5-tuple matching the Gradio outputs:
        (token ids as a string, token count, round-tripped text,
         round-trip success flag, HTML with one colored <span> per token)
    """
    import html  # stdlib; escapes user text before embedding it in HTML

    encoded = tokenizer.encode(text)
    decoded = tokenizer.decode(encoded)

    # Build the reverse merge table once (merged id -> constituent pair)
    # instead of linearly scanning tokenizer.merges for every token.
    pair_of = {merged: pair for pair, merged in tokenizer.merges.items()}

    def _expand(token_id):
        """Recursively expand a token id into its raw UTF-8 bytes."""
        if token_id < 256:
            return bytes([token_id])
        pair = pair_of.get(token_id)
        if pair is None:
            return b''  # unknown id: contribute nothing rather than crash
        return _expand(pair[0]) + _expand(pair[1])

    token_bytes = [_expand(token) for token in encoded]

    # Color-code each token's decoded text; one color per distinct token id.
    colors = {}
    html_parts = []
    for token_id, raw in zip(encoded, token_bytes):
        try:
            token_text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # NOTE(review): a multi-byte character split across two tokens
            # cannot be decoded per-token and is silently skipped, so the
            # colored preview may drop characters for such inputs.
            continue
        if not token_text:
            continue
        if token_id not in colors:
            colors[token_id] = generate_color()
        # Escape the text so user input cannot inject markup into gr.HTML.
        html_parts.append(
            f'<span style="background-color: {colors[token_id]};">'
            f'{html.escape(token_text)}</span>'
        )

    colored_text = ''.join(html_parts)

    return (
        str(encoded),
        len(encoded),
        decoded,
        text == decoded,
        colored_text,
    )
77
 
78
  def decode_tokens(token_string):
79
  """Decode token sequence back to text"""
80
  try:
 
81
  tokens = [int(t.strip()) for t in token_string.replace('[', '').replace(']', '').split(',')]
82
  decoded = tokenizer.decode(tokens)
83
  return decoded, len(tokens)
 
99
  with gr.Row():
100
  token_ids = gr.Textbox(label="Token IDs")
101
  token_count = gr.Number(label="Token Count")
102
+
103
+ with gr.Row():
104
  decoded_text = gr.Textbox(label="Decoded Text")
105
  roundtrip_success = gr.Checkbox(label="Successful Round-trip")
106
 
107
+ with gr.Row():
108
+ colored_tokens = gr.HTML(label="Tokenized Text (Color Coded)")
109
+
110
  # Add example inputs for encoding
111
  gr.Examples(
112
  examples=[
 
134
  # Add example inputs for decoding
135
  gr.Examples(
136
  examples=[
137
+ ["[256, 257, 258, 259]"],
138
  ["[260, 261, 262, 263]"],
139
  ],
140
  inputs=input_tokens
 
144
  encode_btn.click(
145
  fn=encode_text,
146
  inputs=input_text,
147
+ outputs=[token_ids, token_count, decoded_text, roundtrip_success, colored_tokens]
148
  )
149
 
150
  decode_btn.click(
 
155
 
156
  # Launch the app
157
  if __name__ == "__main__":
158
+ iface.launch()