Add ability to color-code the tokenized text.
Browse files
app.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
import json
|
|
|
3 |
from read_files import Tokenizer # Make sure to include this file
|
4 |
|
5 |
def load_tokenizer(path):
|
@@ -9,19 +10,74 @@ def load_tokenizer(path):
|
|
9 |
merges = {tuple(map(int, k.split(','))): v for k, v in serialized_merges.items()}
|
10 |
return Tokenizer(merges)
|
11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
# Load tokenizer
|
13 |
tokenizer = load_tokenizer('tokenizer.json')
|
14 |
|
15 |
def encode_text(text):
|
16 |
"""Encode text to tokens"""
|
|
|
17 |
encoded = tokenizer.encode(text)
|
18 |
decoded = tokenizer.decode(encoded)
|
19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
def decode_tokens(token_string):
|
22 |
"""Decode token sequence back to text"""
|
23 |
try:
|
24 |
-
# Convert string representation of tokens to list of integers
|
25 |
tokens = [int(t.strip()) for t in token_string.replace('[', '').replace(']', '').split(',')]
|
26 |
decoded = tokenizer.decode(tokens)
|
27 |
return decoded, len(tokens)
|
@@ -43,9 +99,14 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
|
|
43 |
with gr.Row():
|
44 |
token_ids = gr.Textbox(label="Token IDs")
|
45 |
token_count = gr.Number(label="Token Count")
|
|
|
|
|
46 |
decoded_text = gr.Textbox(label="Decoded Text")
|
47 |
roundtrip_success = gr.Checkbox(label="Successful Round-trip")
|
48 |
|
|
|
|
|
|
|
49 |
# Add example inputs for encoding
|
50 |
gr.Examples(
|
51 |
examples=[
|
@@ -73,7 +134,7 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
|
|
73 |
# Add example inputs for decoding
|
74 |
gr.Examples(
|
75 |
examples=[
|
76 |
-
["[256, 257, 258, 259]"],
|
77 |
["[260, 261, 262, 263]"],
|
78 |
],
|
79 |
inputs=input_tokens
|
@@ -83,7 +144,7 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
|
|
83 |
encode_btn.click(
|
84 |
fn=encode_text,
|
85 |
inputs=input_text,
|
86 |
-
outputs=[token_ids, token_count, decoded_text, roundtrip_success]
|
87 |
)
|
88 |
|
89 |
decode_btn.click(
|
@@ -94,4 +155,4 @@ with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
|
|
94 |
|
95 |
# Launch the app
|
96 |
if __name__ == "__main__":
|
97 |
-
iface.launch()
|
|
|
1 |
import gradio as gr
|
2 |
import json
|
3 |
+
import random
|
4 |
from read_files import Tokenizer # Make sure to include this file
|
5 |
|
6 |
def load_tokenizer(path):
|
|
|
10 |
merges = {tuple(map(int, k.split(','))): v for k, v in serialized_merges.items()}
|
11 |
return Tokenizer(merges)
|
12 |
|
13 |
+
def generate_color():
    """Return a random pastel color as a CSS ``rgb(r, g, b)`` string."""
    # Random hue with low saturation and high value gives pastel shades.
    hue = random.random()
    saturation = 0.3 + random.random() * 0.2
    value = 0.9 + random.random() * 0.1

    # Convert HSV to RGB
    import colorsys
    r, g, b = colorsys.hsv_to_rgb(hue, saturation, value)
    channels = ', '.join(str(int(c * 255)) for c in (r, g, b))
    return f"rgb({channels})"
|
23 |
+
|
24 |
# Load tokenizer
|
25 |
tokenizer = load_tokenizer('tokenizer.json')
|
26 |
|
27 |
def encode_text(text):
    """Encode *text* with the BPE tokenizer and build a color-coded view.

    Returns a 5-tuple matching the Gradio outputs:
        (token-id string, token count, round-tripped text,
         round-trip success flag, HTML with one colored span per token)
    """
    encoded = tokenizer.encode(text)
    decoded = tokenizer.decode(encoded)

    # Reverse merge table built once: merged-token id -> (left, right) pair.
    # The original scanned tokenizer.merges linearly for every merged token
    # (O(len(merges)) per lookup); a dict lookup is O(1).
    pair_of = {v: k for k, v in tokenizer.merges.items()}

    def expand_token(t):
        """Recursively expand a token id into the raw bytes it represents."""
        if t < 256:
            return bytes([t])
        pair = pair_of.get(t)
        if pair:
            return expand_token(pair[0]) + expand_token(pair[1])
        return b''  # unknown token id — contributes no bytes

    # One bytes object per encoded token, aligned index-for-index with `encoded`.
    token_bytes = [expand_token(token) for token in encoded]

    # Color-code each token's text, reusing one color per distinct token id.
    colors = {}
    html_parts = []
    for token_id, raw in zip(encoded, token_bytes):
        try:
            token_text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # Token bytes are not valid UTF-8 on their own (e.g. a fragment
            # of a multi-byte character) — skip it in the colored view, as
            # the original code did.
            continue
        if token_text:
            if token_id not in colors:
                colors[token_id] = generate_color()
            color = colors[token_id]
            html_parts.append(f'<span style="background-color: {color};">{token_text}</span>')

    colored_text = ''.join(html_parts)

    return (
        str(encoded),
        len(encoded),
        decoded,
        text == decoded,
        colored_text
    )
|
77 |
|
78 |
def decode_tokens(token_string):
|
79 |
"""Decode token sequence back to text"""
|
80 |
try:
|
|
|
81 |
tokens = [int(t.strip()) for t in token_string.replace('[', '').replace(']', '').split(',')]
|
82 |
decoded = tokenizer.decode(tokens)
|
83 |
return decoded, len(tokens)
|
|
|
99 |
with gr.Row():
|
100 |
token_ids = gr.Textbox(label="Token IDs")
|
101 |
token_count = gr.Number(label="Token Count")
|
102 |
+
|
103 |
+
with gr.Row():
|
104 |
decoded_text = gr.Textbox(label="Decoded Text")
|
105 |
roundtrip_success = gr.Checkbox(label="Successful Round-trip")
|
106 |
|
107 |
+
with gr.Row():
|
108 |
+
colored_tokens = gr.HTML(label="Tokenized Text (Color Coded)")
|
109 |
+
|
110 |
# Add example inputs for encoding
|
111 |
gr.Examples(
|
112 |
examples=[
|
|
|
134 |
# Add example inputs for decoding
|
135 |
gr.Examples(
|
136 |
examples=[
|
137 |
+
["[256, 257, 258, 259]"],
|
138 |
["[260, 261, 262, 263]"],
|
139 |
],
|
140 |
inputs=input_tokens
|
|
|
144 |
encode_btn.click(
|
145 |
fn=encode_text,
|
146 |
inputs=input_text,
|
147 |
+
outputs=[token_ids, token_count, decoded_text, roundtrip_success, colored_tokens]
|
148 |
)
|
149 |
|
150 |
decode_btn.click(
|
|
|
155 |
|
156 |
# Launch the app
|
157 |
if __name__ == "__main__":
|
158 |
+
iface.launch()
|