|
import gradio as gr |
|
import json |
|
from read_files import Tokenizer |
|
|
|
def load_tokenizer(path):
    """Build a ``Tokenizer`` from a JSON file of serialized merges.

    The file must hold a single JSON object whose keys are comma-separated
    integers (for example ``"101,32"``). Each key is converted to a tuple of
    ints and paired with its original value to form the merge table that is
    handed to ``Tokenizer``.

    Args:
        path: Location of the JSON merges file.

    Returns:
        A ``Tokenizer`` constructed from the parsed merge table.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If the file is not valid JSON (``json.JSONDecodeError``)
            or a key contains a part that is not an integer.
    """
    # JSON is UTF-8 by specification; naming the encoding keeps loading
    # independent of the platform's locale default.
    with open(path, encoding="utf-8") as f:
        serialized_merges = json.load(f)

    # JSON object keys are always strings, so "a,b" is turned back into (a, b).
    merges = {
        tuple(int(part) for part in key.split(",")): value
        for key, value in serialized_merges.items()
    }
    return Tokenizer(merges)
|
|
|
|
|
# Built once at import time; process_text reads this module-level instance
# on every call instead of reloading the merges file.
tokenizer = load_tokenizer("tokenizer.json")
|
|
|
def process_text(text):
    """Encode ``text`` with the module-level tokenizer and decode it back.

    Args:
        text: The input string to tokenize.

    Returns:
        A dict with four entries:
        ``"Encoded Tokens"`` (``str()`` of the encoded result),
        ``"Number of Tokens"`` (``len()`` of the encoded result),
        ``"Decoded Text"`` (the result of decoding the encoded tokens), and
        ``"Round-trip Success"`` (whether the decoded text equals ``text``).
    """
    token_ids = tokenizer.encode(text)
    round_tripped = tokenizer.decode(token_ids)

    summary = {}
    summary["Encoded Tokens"] = str(token_ids)
    summary["Number of Tokens"] = len(token_ids)
    summary["Decoded Text"] = round_tripped
    summary["Round-trip Success"] = text == round_tripped
    return summary
|
|
|
|
|
# Keys of the dict returned by process_text, listed in the same order as the
# output components passed to gr.Interface below.
_OUTPUT_KEYS = (
    "Encoded Tokens",
    "Number of Tokens",
    "Decoded Text",
    "Round-trip Success",
)


def _process_text_outputs(text):
    """Return process_text's results as a tuple ordered like ``_OUTPUT_KEYS``."""
    result = process_text(text)
    return tuple(result[key] for key in _OUTPUT_KEYS)


# gr.Interface takes its outputs as a component or a list of components and
# matches the function's return values to them by position; a dict keyed by
# display names is not an accepted form. The adapter above bridges
# process_text's dict result to that positional contract.
iface = gr.Interface(
    fn=_process_text_outputs,
    inputs=gr.Textbox(label="Input Marathi Text", placeholder="नमस्कार, जग!"),
    outputs=[
        gr.Textbox(label="Token IDs"),
        gr.Number(label="Token Count"),
        gr.Textbox(label="Decoded Text"),
        gr.Checkbox(label="Successful Round-trip"),
    ],
    title="Marathi BPE Tokenizer",
    description="Enter Marathi text to see how it's tokenized using byte-pair encoding.",
    examples=[
        ["नमस्कार, जग!"],
        ["ही एक चाचणी आहे."],
    ],
)


# Launch the web UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()