File size: 4,971 Bytes
a6eaabf f4bdbee a6eaabf f4bdbee a6eaabf 93688f9 f4bdbee a6eaabf f4bdbee a6eaabf 93688f9 a6eaabf 21e0069 93688f9 f4bdbee 93688f9 f4bdbee 93688f9 21e0069 93688f9 f4bdbee 93688f9 21e0069 93688f9 21e0069 f4bdbee 21e0069 93688f9 a6eaabf f4bdbee |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
import gradio as gr
import json
import random
from read_files import Tokenizer # Make sure to include this file
def load_tokenizer(path):
    """Load a BPE tokenizer whose merge table was serialized to JSON.

    The JSON file maps ``"a,b"`` pair keys to merged token ids; each key is
    parsed back into an ``(int, int)`` tuple before building the Tokenizer.

    Args:
        path: Path to the serialized merges JSON file.

    Returns:
        A ``Tokenizer`` constructed from the deserialized merge table.
    """
    # Explicit encoding so the JSON reads identically on every platform
    # (the default locale encoding can differ, e.g. on Windows).
    with open(path, 'r', encoding='utf-8') as f:
        serialized_merges = json.load(f)
    merges = {
        tuple(map(int, key.split(','))): token_id
        for key, token_id in serialized_merges.items()
    }
    return Tokenizer(merges)
def generate_color():
    """Return a random pastel color as a CSS ``rgb(r, g, b)`` string."""
    import colorsys

    # Low saturation + high value keeps the colors in the pastel range.
    # (Evaluation order of the tuple preserves the random() call sequence.)
    hue, saturation, value = (
        random.random(),
        0.3 + random.random() * 0.2,
        0.9 + random.random() * 0.1,
    )
    red, green, blue = (
        int(channel * 255) for channel in colorsys.hsv_to_rgb(hue, saturation, value)
    )
    return f"rgb({red}, {green}, {blue})"
# Load the trained tokenizer once at import time; expects `tokenizer.json`
# in the current working directory (produced by the training step).
tokenizer = load_tokenizer('tokenizer.json')
def encode_text(text):
    """Encode text with the BPE tokenizer and build a color-coded preview.

    Args:
        text: Raw input text to tokenize.

    Returns:
        A 5-tuple of (token-id string, token count, round-tripped text,
        round-trip success flag, color-coded HTML of the tokenized text).
    """
    import html

    encoded = tokenizer.encode(text)
    decoded = tokenizer.decode(encoded)

    # Reverse lookup (merged token id -> constituent pair), built once.
    # The original scanned tokenizer.merges linearly for every expansion.
    pair_for = {merged: pair for pair, merged in tokenizer.merges.items()}

    def expand_token(t):
        """Recursively expand a token id into its raw UTF-8 bytes."""
        if t < 256:
            return bytes([t])
        pair = pair_for.get(t)
        if pair is None:
            # Unknown merged token: contribute nothing (matches original).
            return b''
        return expand_token(pair[0]) + expand_token(pair[1])

    token_bytes = [expand_token(token) for token in encoded]

    # Color-code each decodable token; identical token ids share a color.
    colors = {}
    html_parts = []
    for token, raw in zip(encoded, token_bytes):
        try:
            token_text = raw.decode('utf-8')
        except UnicodeDecodeError:
            # A token's bytes may not form a complete UTF-8 sequence on
            # their own; such tokens are skipped in the visual preview.
            continue
        if token_text:
            if token not in colors:
                colors[token] = generate_color()
            # Escape so input text cannot inject markup into the HTML view.
            html_parts.append(
                f'<span style="background-color: {colors[token]};">'
                f'{html.escape(token_text)}</span>'
            )

    colored_text = ''.join(html_parts)
    return (
        str(encoded),
        len(encoded),
        decoded,
        text == decoded,
        colored_text,
    )
def decode_tokens(token_string):
    """Decode a comma-separated sequence of token IDs back into text.

    Accepts input with or without surrounding brackets, e.g. "[1, 2, 3]".

    Returns:
        (decoded_text, token_count) on success, or ("Error: ...", 0) when
        parsing or decoding fails.
    """
    try:
        # Drop any bracket characters, then parse the comma-separated ids.
        cleaned = token_string.translate(str.maketrans('', '', '[]'))
        tokens = [int(piece.strip()) for piece in cleaned.split(',')]
        return tokenizer.decode(tokens), len(tokens)
    except Exception as exc:
        return f"Error: {exc}", 0
# ---------------------------------------------------------------------------
# Gradio UI: two tabs (Encode / Decode) wired to the handlers defined above.
# `iface` is the Blocks app launched from the __main__ guard below.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
    gr.Markdown("# Marathi BPE Tokenizer")

    # --- Encode tab: free text in, token ids + color-coded preview out ---
    with gr.Tab("Encode"):
        gr.Markdown("Enter Marathi text to encode it into tokens.")
        with gr.Row():
            input_text = gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!")
        with gr.Row():
            encode_btn = gr.Button("Encode")
        with gr.Row():
            token_ids = gr.Textbox(label="Token IDs")
            token_count = gr.Number(label="Token Count")
        with gr.Row():
            decoded_text = gr.Textbox(label="Decoded Text")
            # True when decode(encode(text)) reproduces the input exactly.
            roundtrip_success = gr.Checkbox(label="Successful Round-trip")
        with gr.Row():
            colored_tokens = gr.HTML(label="Tokenized Text (Color Coded)")

        # Add example inputs for encoding
        gr.Examples(
            examples=[
                ["नमस्कार, जग!"],
                ["ही एक चाचणी आहे."],
            ],
            inputs=input_text
        )

    # --- Decode tab: token-id list in, reconstructed text out ---
    with gr.Tab("Decode"):
        gr.Markdown("Enter a sequence of token IDs to decode them back to text.")
        with gr.Row():
            input_tokens = gr.Textbox(
                label="Input Token IDs",
                placeholder="[256, 257, 258]"
            )
        with gr.Row():
            decode_btn = gr.Button("Decode")
        with gr.Row():
            decoded_result = gr.Textbox(label="Decoded Text")
            token_count_decode = gr.Number(label="Token Count")

        # Add example inputs for decoding
        # NOTE(review): these ids assume the merge table defines tokens
        # 256-263 — verify against the shipped tokenizer.json.
        gr.Examples(
            examples=[
                ["[256, 257, 258, 259]"],
                ["[260, 261, 262, 263]"],
            ],
            inputs=input_tokens
        )

    # Set up click events: bind each button to its handler and outputs.
    encode_btn.click(
        fn=encode_text,
        inputs=input_text,
        outputs=[token_ids, token_count, decoded_text, roundtrip_success, colored_tokens]
    )
    decode_btn.click(
        fn=decode_tokens,
        inputs=input_tokens,
        outputs=[decoded_result, token_count_decode]
    )
# Launch the app only when run as a script (not when imported as a module).
if __name__ == "__main__":
    iface.launch()