Spaces:

nragrawal
/

marathi-tokenizer-new-space

Sleeping

App Files Files Community

nragrawal committed on Jan 8

Commit

93688f9

1 Parent(s): 21e0069

Add ability to decode directly.

Browse files

Files changed (1) hide show

app.py +66 -29

app.py CHANGED Viewed

@@ -12,48 +12,85 @@ def load_tokenizer(path):
 # Load tokenizer
 tokenizer = load_tokenizer('tokenizer.json')
-def process_text(text):
-    """Process text through the tokenizer"""
-    # Encode
     encoded = tokenizer.encode(text)
-    # Decode back to verify
     decoded = tokenizer.decode(encoded)
     return str(encoded), len(encoded), decoded, text == decoded
 # Create Gradio interface
 with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
     gr.Markdown("# Marathi BPE Tokenizer")
-    gr.Markdown("Enter Marathi text to see how it's tokenized using byte-pair encoding.")
-    with gr.Row():
-        input_text = gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!")
-    with gr.Row():
-        process_btn = gr.Button("Tokenize")
-    with gr.Row():
-        token_ids = gr.Textbox(label="Token IDs")
-        token_count = gr.Number(label="Token Count")
-        decoded_text = gr.Textbox(label="Decoded Text")
-        roundtrip_success = gr.Checkbox(label="Successful Round-trip")
-    # Add example inputs
-    gr.Examples(
-        examples=[
-            ["नमस्कार, जग!"],
-            ["ही एक चाचणी आहे."],
-        ],
-        inputs=input_text
-    )
-    # Set up click event
-    process_btn.click(
-        fn=process_text,
         inputs=input_text,
         outputs=[token_ids, token_count, decoded_text, roundtrip_success]
     )
 # Launch the app
 if __name__ == "__main__":

 # Load tokenizer
 tokenizer = load_tokenizer('tokenizer.json')
+def encode_text(text):
+    """Encode text to tokens"""
     encoded = tokenizer.encode(text)
     decoded = tokenizer.decode(encoded)
     return str(encoded), len(encoded), decoded, text == decoded
+def decode_tokens(token_string):
+    """Decode token sequence back to text"""
+    try:
+        # Convert string representation of tokens to list of integers
+        tokens = [int(t.strip()) for t in token_string.replace('[', '').replace(']', '').split(',')]
+        decoded = tokenizer.decode(tokens)
+        return decoded, len(tokens)
+    except Exception as e:
+        return f"Error: {str(e)}", 0
 # Create Gradio interface
 with gr.Blocks(title="Marathi BPE Tokenizer") as iface:
     gr.Markdown("# Marathi BPE Tokenizer")
+    with gr.Tab("Encode"):
+        gr.Markdown("Enter Marathi text to encode it into tokens.")
+        with gr.Row():
+            input_text = gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!")
+        with gr.Row():
+            encode_btn = gr.Button("Encode")
+        with gr.Row():
+            token_ids = gr.Textbox(label="Token IDs")
+            token_count = gr.Number(label="Token Count")
+            decoded_text = gr.Textbox(label="Decoded Text")
+            roundtrip_success = gr.Checkbox(label="Successful Round-trip")
+        # Add example inputs for encoding
+        gr.Examples(
+            examples=[
+                ["नमस्कार, जग!"],
+                ["ही एक चाचणी आहे."],
+            ],
+            inputs=input_text
+        )
+    with gr.Tab("Decode"):
+        gr.Markdown("Enter a sequence of token IDs to decode them back to text.")
+        with gr.Row():
+            input_tokens = gr.Textbox(
+                label="Input Token IDs",
+                placeholder="[256, 257, 258]"
+            )
+        with gr.Row():
+            decode_btn = gr.Button("Decode")
+        with gr.Row():
+            decoded_result = gr.Textbox(label="Decoded Text")
+            token_count_decode = gr.Number(label="Token Count")
+        # Add example inputs for decoding
+        gr.Examples(
+            examples=[
+                ["[256, 257, 258, 259]"],  # Add some actual token sequences here
+                ["[260, 261, 262, 263]"],
+            ],
+            inputs=input_tokens
+        )
+    # Set up click events
+    encode_btn.click(
+        fn=encode_text,
         inputs=input_text,
         outputs=[token_ids, token_count, decoded_text, roundtrip_success]
     )
+    decode_btn.click(
+        fn=decode_tokens,
+        inputs=input_tokens,
+        outputs=[decoded_result, token_count_decode]
+    )
 # Launch the app
 if __name__ == "__main__":