File size: 4,971 Bytes
a6eaabf
 
f4bdbee
a6eaabf
 
 
 
 
 
 
 
 
f4bdbee
 
 
 
 
 
 
 
 
 
 
a6eaabf
 
 
93688f9
 
f4bdbee
a6eaabf
 
f4bdbee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a6eaabf
93688f9
 
 
 
 
 
 
 
 
a6eaabf
21e0069
 
 
93688f9
 
 
 
 
 
 
 
 
 
 
f4bdbee
 
93688f9
 
 
f4bdbee
 
 
93688f9
 
 
 
 
 
 
 
21e0069
93688f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4bdbee
93688f9
 
 
 
21e0069
93688f9
 
 
21e0069
f4bdbee
21e0069
93688f9
 
 
 
 
 
a6eaabf
 
 
f4bdbee
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import html
import json
import random

import gradio as gr

from read_files import Tokenizer  # Make sure to include this file

def load_tokenizer(path):
    """Load a BPE tokenizer from a serialized merges JSON file.

    The JSON maps "a,b" pair strings to merged token ids; keys are parsed
    back into integer tuples before constructing the Tokenizer.

    Args:
        path: Path to the tokenizer JSON file.

    Returns:
        A Tokenizer built from the deserialized merge table.
    """
    # Explicit encoding: JSON files are UTF-8 by spec; don't depend on the
    # platform default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        serialized_merges = json.load(f)
    merges = {tuple(map(int, k.split(','))): v for k, v in serialized_merges.items()}
    return Tokenizer(merges)

def generate_color():
    """Return a random pastel color as a CSS ``rgb(r, g, b)`` string."""
    import colorsys

    # Pastel look: any hue, low saturation, high brightness.
    hue = random.random()
    sat = 0.3 + random.random() * 0.2
    val = 0.9 + random.random() * 0.1

    r, g, b = colorsys.hsv_to_rgb(hue, sat, val)
    return f"rgb({int(r * 255)}, {int(g * 255)}, {int(b * 255)})"

# Load tokenizer once at import time; 'tokenizer.json' must sit next to this
# script or the app fails to start.
tokenizer = load_tokenizer('tokenizer.json')

def encode_text(text):
    """Encode text with the BPE tokenizer and build a color-coded preview.

    Args:
        text: Input string (expected Marathi, but any UTF-8 text works).

    Returns:
        Tuple of (token-id list as string, token count, round-tripped
        decoded text, round-trip success flag, color-coded HTML string).
    """
    encoded = tokenizer.encode(text)
    decoded = tokenizer.decode(encoded)

    def expand_token(t):
        """Recursively expand a (possibly merged) token id to its raw bytes."""
        if t < 256:
            return bytes([t])
        # Reverse-lookup the merge pair that produced this token id.
        pair = next((k for k, v in tokenizer.merges.items() if v == t), None)
        if pair:
            return expand_token(pair[0]) + expand_token(pair[1])
        return b''

    token_bytes = [expand_token(token) for token in encoded]

    # Color-code each token. The bytes of one multi-byte UTF-8 character can
    # be split across tokens, so buffer bytes until they decode cleanly
    # instead of silently dropping partial sequences.
    colors = {}
    html_parts = []
    pending = b''
    pending_token = None  # token id that started the current buffered run
    for token, tb in zip(encoded, token_bytes):
        if pending_token is None:
            pending_token = token
        pending += tb
        try:
            token_text = pending.decode('utf-8')
        except UnicodeDecodeError:
            continue  # incomplete character; wait for more bytes
        key = pending_token
        pending = b''
        pending_token = None
        if token_text:
            if key not in colors:
                colors[key] = generate_color()
            # Escape so user text cannot inject markup into the HTML preview.
            html_parts.append(
                f'<span style="background-color: {colors[key]};">'
                f'{html.escape(token_text)}</span>'
            )

    colored_text = ''.join(html_parts)

    return (
        str(encoded),
        len(encoded),
        decoded,
        text == decoded,
        colored_text
    )

def decode_tokens(token_string):
    """Decode a comma-separated list of token ids back into text.

    Args:
        token_string: Token ids, e.g. "[256, 257, 258]" (brackets optional).

    Returns:
        Tuple of (decoded text, token count), or an error message and 0
        when the input cannot be parsed or decoded.
    """
    try:
        cleaned = token_string.replace('[', '').replace(']', '')
        tokens = [int(piece.strip()) for piece in cleaned.split(',')]
        decoded = tokenizer.decode(tokens)
    except Exception as e:
        # Best-effort UI: surface the problem instead of crashing the app.
        return f"Error: {str(e)}", 0
    return decoded, len(tokens)

# Create Gradio interface: two tabs (Encode / Decode) wired to the
# encode_text and decode_tokens functions above.
with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
    gr.Markdown("# Marathi BPE Tokenizer")
    
    # --- Encode tab: text in, token ids + color-coded preview out ---
    with gr.Tab("Encode"):
        gr.Markdown("Enter Marathi text to encode it into tokens.")
        with gr.Row():
            input_text = gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!")
        
        with gr.Row():
            encode_btn = gr.Button("Encode")
        
        with gr.Row():
            token_ids = gr.Textbox(label="Token IDs")
            token_count = gr.Number(label="Token Count")
        
        with gr.Row():
            decoded_text = gr.Textbox(label="Decoded Text")
            roundtrip_success = gr.Checkbox(label="Successful Round-trip")
        
        with gr.Row():
            colored_tokens = gr.HTML(label="Tokenized Text (Color Coded)")
        
        # Add example inputs for encoding
        gr.Examples(
            examples=[
                ["नमस्कार, जग!"],
                ["ही एक चाचणी आहे."],
            ],
            inputs=input_text
        )
    
    # --- Decode tab: token-id string in, decoded text out ---
    with gr.Tab("Decode"):
        gr.Markdown("Enter a sequence of token IDs to decode them back to text.")
        with gr.Row():
            input_tokens = gr.Textbox(
                label="Input Token IDs", 
                placeholder="[256, 257, 258]"
            )
        
        with gr.Row():
            decode_btn = gr.Button("Decode")
        
        with gr.Row():
            decoded_result = gr.Textbox(label="Decoded Text")
            token_count_decode = gr.Number(label="Token Count")
        
        # Add example inputs for decoding
        # NOTE(review): these example ids assume the learned merge table
        # contains ids 256-263 — verify against the shipped tokenizer.json.
        gr.Examples(
            examples=[
                ["[256, 257, 258, 259]"],
                ["[260, 261, 262, 263]"],
            ],
            inputs=input_tokens
        )
    
    # Set up click events: outputs must match the tuple order returned by
    # encode_text / decode_tokens.
    encode_btn.click(
        fn=encode_text,
        inputs=input_text,
        outputs=[token_ids, token_count, decoded_text, roundtrip_success, colored_tokens]
    )
    
    decode_btn.click(
        fn=decode_tokens,
        inputs=input_tokens,
        outputs=[decoded_result, token_count_decode]
    )

# Launch the app only when run as a script (not when imported as a module).
if __name__ == "__main__":
    iface.launch()