nragrawal committed on
Commit
93688f9
·
1 Parent(s): 21e0069

Add ability to decode directly.

Browse files
Files changed (1) hide show
  1. app.py +66 -29
app.py CHANGED
@@ -12,48 +12,85 @@ def load_tokenizer(path):
12
  # Load tokenizer
13
  tokenizer = load_tokenizer('tokenizer.json')
14
 
15
- def process_text(text):
16
- """Process text through the tokenizer"""
17
- # Encode
18
  encoded = tokenizer.encode(text)
19
-
20
- # Decode back to verify
21
  decoded = tokenizer.decode(encoded)
22
-
23
  return str(encoded), len(encoded), decoded, text == decoded
24
 
 
 
 
 
 
 
 
 
 
 
25
  # Create Gradio interface
26
  with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
27
  gr.Markdown("# Marathi BPE Tokenizer")
28
- gr.Markdown("Enter Marathi text to see how it's tokenized using byte-pair encoding.")
29
-
30
- with gr.Row():
31
- input_text = gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!")
32
 
33
- with gr.Row():
34
- process_btn = gr.Button("Tokenize")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- with gr.Row():
37
- token_ids = gr.Textbox(label="Token IDs")
38
- token_count = gr.Number(label="Token Count")
39
- decoded_text = gr.Textbox(label="Decoded Text")
40
- roundtrip_success = gr.Checkbox(label="Successful Round-trip")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
- # Add example inputs
43
- gr.Examples(
44
- examples=[
45
- ["नमस्कार, जग!"],
46
- ["ही एक चाचणी आहे."],
47
- ],
48
- inputs=input_text
49
- )
50
-
51
- # Set up click event
52
- process_btn.click(
53
- fn=process_text,
54
  inputs=input_text,
55
  outputs=[token_ids, token_count, decoded_text, roundtrip_success]
56
  )
 
 
 
 
 
 
57
 
58
  # Launch the app
59
  if __name__ == "__main__":
 
12
  # Load tokenizer
13
  tokenizer = load_tokenizer('tokenizer.json')
14
 
15
+ def encode_text(text):
16
+ """Encode text to tokens"""
 
17
  encoded = tokenizer.encode(text)
 
 
18
  decoded = tokenizer.decode(encoded)
 
19
  return str(encoded), len(encoded), decoded, text == decoded
20
 
21
+ def decode_tokens(token_string):
22
+ """Decode token sequence back to text"""
23
+ try:
24
+ # Convert string representation of tokens to list of integers
25
+ tokens = [int(t.strip()) for t in token_string.replace('[', '').replace(']', '').split(',')]
26
+ decoded = tokenizer.decode(tokens)
27
+ return decoded, len(tokens)
28
+ except Exception as e:
29
+ return f"Error: {str(e)}", 0
30
+
31
  # Create Gradio interface
32
  with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
33
  gr.Markdown("# Marathi BPE Tokenizer")
 
 
 
 
34
 
35
+ with gr.Tab("Encode"):
36
+ gr.Markdown("Enter Marathi text to encode it into tokens.")
37
+ with gr.Row():
38
+ input_text = gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!")
39
+
40
+ with gr.Row():
41
+ encode_btn = gr.Button("Encode")
42
+
43
+ with gr.Row():
44
+ token_ids = gr.Textbox(label="Token IDs")
45
+ token_count = gr.Number(label="Token Count")
46
+ decoded_text = gr.Textbox(label="Decoded Text")
47
+ roundtrip_success = gr.Checkbox(label="Successful Round-trip")
48
+
49
+ # Add example inputs for encoding
50
+ gr.Examples(
51
+ examples=[
52
+ ["नमस्कार, जग!"],
53
+ ["ही एक चाचणी आहे."],
54
+ ],
55
+ inputs=input_text
56
+ )
57
 
58
+ with gr.Tab("Decode"):
59
+ gr.Markdown("Enter a sequence of token IDs to decode them back to text.")
60
+ with gr.Row():
61
+ input_tokens = gr.Textbox(
62
+ label="Input Token IDs",
63
+ placeholder="[256, 257, 258]"
64
+ )
65
+
66
+ with gr.Row():
67
+ decode_btn = gr.Button("Decode")
68
+
69
+ with gr.Row():
70
+ decoded_result = gr.Textbox(label="Decoded Text")
71
+ token_count_decode = gr.Number(label="Token Count")
72
+
73
+ # Add example inputs for decoding
74
+ gr.Examples(
75
+ examples=[
76
+ ["[256, 257, 258, 259]"], # Add some actual token sequences here
77
+ ["[260, 261, 262, 263]"],
78
+ ],
79
+ inputs=input_tokens
80
+ )
81
 
82
+ # Set up click events
83
+ encode_btn.click(
84
+ fn=encode_text,
 
 
 
 
 
 
 
 
 
85
  inputs=input_text,
86
  outputs=[token_ids, token_count, decoded_text, roundtrip_success]
87
  )
88
+
89
+ decode_btn.click(
90
+ fn=decode_tokens,
91
+ inputs=input_tokens,
92
+ outputs=[decoded_result, token_count_decode]
93
+ )
94
 
95
  # Launch the app
96
  if __name__ == "__main__":