import traceback
import gradio as gr
from tokenizer import GujaratiBPETokenizer
# Load the tokenizer from "Gujarati_tokenizer.json" (path is relative to the
# current working directory) once at import time; the handlers below share it.
# NOTE(review): this assumes `load()` returns the tokenizer instance rather
# than None — confirm against tokenizer.py.
tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")
# Function to encode the text and return both the encoded text and compression ratio
def encode_text(input_text):
    """Tokenize ``input_text`` and report how compactly it was encoded.

    Args:
        input_text: The text to encode with the module-level ``tokenizer``.

    Returns:
        A ``(token_ids, compression_ratio)`` tuple.  The ratio is the UTF-8
        byte length of the input divided by the number of token IDs, rounded
        to two decimals; it is ``0.0`` when no tokens were produced.
    """
    ids = tokenizer.encode(input_text)

    # Size of the raw text, measured in UTF-8 bytes rather than characters.
    utf8_size = len(input_text.encode('utf-8'))
    id_count = len(ids)

    # Guard against dividing by zero when the tokenizer yields nothing.
    if id_count > 0:
        ratio = round(utf8_size / id_count, 2)
    else:
        ratio = 0.0
    return ids, ratio
# Function to decode the encoded text back to original text
def decode_text(token_ids):
    """Decode user-supplied token IDs back into text.

    Args:
        token_ids: Normally a string of comma-separated integers, optionally
            wrapped in square brackets (e.g. ``"[2517, 2074, 340]"`` or
            ``"2517,2074"``).  Surrounding whitespace and empty items such as
            a trailing comma are tolerated.  An already-parsed sequence of
            integers is accepted as well.

    Returns:
        The text produced by the module-level ``tokenizer``; an empty string
        when there are no IDs to decode; or a message starting with
        ``"Error in processing token IDs:"`` when an item is not an integer.
    """
    if token_ids is None:
        return ""

    if isinstance(token_ids, str):
        # Trim whitespace BEFORE removing the brackets, otherwise input such
        # as " [1, 2] " keeps its brackets and fails to parse.
        body = token_ids.strip().strip("[]")
        # Drop blank items so "", "[]" and a trailing comma are not errors.
        pieces = [piece for piece in body.split(",") if piece.strip()]
    else:
        pieces = token_ids

    try:
        ids = [int(piece) for piece in pieces]
    except (TypeError, ValueError) as e:
        return f"Error in processing token IDs: {e}"

    # Nothing to decode: skip the tokenizer call entirely.
    if not ids:
        return ""

    return tokenizer.decode(ids)
# Function to clear all input and output textboxes
def clear_all():
    """Return five empty strings, one for each textbox being reset."""
    textbox_count = 5
    return ("",) * textbox_count
# Create Gradio interface
with gr.Blocks() as demo:
    # Page header / description HTML.
    # NOTE(review): the original literal was an unterminated single-quoted
    # string spanning two lines (a SyntaxError).  It is written here as the
    # equivalent "\n"; the intended HTML content appears to be missing —
    # confirm and restore it if needed.
    article = "\n"
    gr.HTML("\n")
    gr.HTML(article)

    with gr.Row():
        # Column 1: Encoding
        with gr.Column():
            gr.Markdown("## Encode Gujarati Text")
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input",
            )
            encode_button = gr.Button("Encode")
            encoded_text_output = gr.Textbox(
                label="Encoded Text", lines=4, interactive=False
            )
            compression_ratio_output = gr.Textbox(
                label="Compression Ratio", interactive=False
            )

            # Clickable sample sentences that populate the input textbox.
            encode_example = gr.Examples(
                examples=[
                    "ગુજરાત અને ભારતમાં સ્થાન",
                    "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                    "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                    "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે.",
                ],
                inputs=text_input,
                outputs=[encoded_text_output, compression_ratio_output],
                fn=encode_text,
            )

            # Encode button: run encode_text and show token IDs plus ratio.
            encode_button.click(
                encode_text,
                inputs=text_input,
                outputs=[encoded_text_output, compression_ratio_output],
            )

        # Column 2: Decoding
        with gr.Column():
            gr.Markdown("## Decode Tokens to Gujarati Text")
            encoded_text_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input",
            )
            decode_button = gr.Button("Decode")
            decoded_text_output = gr.Textbox(
                label="Decoded Text", lines=3, interactive=False
            )

            # Decode button: run decode_text on the entered token IDs.
            decode_button.click(
                decode_text,
                inputs=encoded_text_input,
                outputs=decoded_text_output,
            )

    # A single button below both columns that resets every textbox.
    clear_button = gr.Button("Clear All")
    # The output order must match the five values returned by clear_all().
    clear_button.click(
        clear_all,
        outputs=[
            text_input,
            encoded_text_output,
            compression_ratio_output,
            encoded_text_input,
            decoded_text_output,
        ],
    )
# Add error handling to launch
try:
    demo.launch()
except Exception as e:
    # Report the failure on stdout: a one-line summary, then the traceback.
    summary = f"Error launching interface: {str(e)}"
    print(summary)
    stack_text = traceback.format_exc()
    print(stack_text)