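# NOTE: the original first three lines read "Spaces: / Sleeping / Sleeping".
# They appear to be status-banner text captured when this file was copied
# from a web page; they are not part of the program.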
import traceback | |
import gradio as gr | |
from tokenizer import GujaratiBPETokenizer | |
# Build the shared tokenizer used by the encode/decode handlers below from
# the serialized state in "Gujarati_tokenizer.json".
# NOTE(review): this relies on load() returning the tokenizer object itself,
# since its return value is what gets bound to `tokenizer` — confirm.
tokenizer = GujaratiBPETokenizer().load(
    "Gujarati_tokenizer.json"
)
# Function to encode the text and return both the encoded text and compression ratio
def encode_text(input_text):
    """Tokenize ``input_text`` and report how many bytes each token covers.

    Args:
        input_text: The text to encode.

    Returns:
        A ``(token_ids, compression_ratio)`` tuple. ``token_ids`` is whatever
        ``tokenizer.encode`` returns. ``compression_ratio`` is the UTF-8 byte
        length of the input divided by the number of token IDs, rounded to
        two decimals, or ``0.0`` when no tokens were produced.
    """
    ids = tokenizer.encode(input_text)
    n_bytes = len(input_text.encode("utf-8"))
    n_tokens = len(ids)

    # Guard the division: an empty encoding has no meaningful ratio.
    if n_tokens > 0:
        ratio = round(n_bytes / n_tokens, 2)
    else:
        ratio = 0.0
    return ids, ratio
# Function to decode the encoded text back to original text
def decode_text(token_ids):
    """Decode a textual list of token IDs back into text.

    Args:
        token_ids: Token IDs as typed by the user, either comma-separated
            (``"1, 2, 3"``) or written as a list (``"[1, 2, 3]"``).
            Whitespace around the brackets and stray or trailing commas
            are tolerated.

    Returns:
        The value returned by ``tokenizer.decode`` for the parsed IDs; an
        empty string when the input contains no IDs; or a message starting
        with ``"Error in processing token IDs:"`` when the input cannot be
        parsed as integers.
    """
    try:
        # Trim outer whitespace first: strip("[]") alone stops at the first
        # non-bracket character, so " [1, 2] " would keep its brackets.
        body = (token_ids or "").strip().strip("[]")
        # Drop empty pieces so "1, 2," or a blank field does not reach int().
        pieces = [piece.strip() for piece in body.split(",")]
        ids = [int(piece) for piece in pieces if piece]
    except (AttributeError, ValueError) as e:
        return f"Error in processing token IDs: {e}"

    # Nothing to decode: return an empty result instead of a parse error.
    if not ids:
        return ""
    return tokenizer.decode(ids)
# Function to clear all input and output textboxes
def clear_all():
    """Return five empty strings, one for each textbox the Clear All button resets."""
    return ("",) * 5
# Create Gradio interface
with gr.Blocks() as demo:
    # Page title with a GitHub link; the icon class comes from the
    # Font Awesome stylesheet injected just below.
    article = (
        "<h1 style='text-align: center;'> Gujarati BPE Tokenizer "
        "<a href='https://github.com/KD1994/session-11-BPE-Tokenizer' target='_blank'> "
        "<i class='fab fa-github' style='font-size: 24px;'></i></a> "
        "</h1>"
    )
    gr.HTML(
        '\n<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">\n'
    )
    gr.HTML(article)

    with gr.Row():
        # Left column: text -> token IDs
        with gr.Column():
            gr.Markdown("## Encode Gujarati Text")
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input",
            )
            encode_button = gr.Button("Encode")
            encoded_text_output = gr.Textbox(
                label="Encoded Text",
                lines=4,
                interactive=False,
            )
            compression_ratio_output = gr.Textbox(
                label="Compression Ratio",
                interactive=False,
            )

            # Sample sentences the user can pick as encoder input
            encode_example = gr.Examples(
                examples=[
                    "ગુજરાત અને ભારતમાં સ્થાન",
                    "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                    "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                    "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે.",
                ],
                inputs=text_input,
                outputs=[encoded_text_output, compression_ratio_output],
                fn=encode_text,
            )

            # Encode button fills both the token-ID box and the ratio box
            encode_button.click(
                encode_text,
                inputs=text_input,
                outputs=[encoded_text_output, compression_ratio_output],
            )

        # Right column: token IDs -> text
        with gr.Column():
            gr.Markdown("## Decode Tokens to Gujarati Text")
            encoded_text_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input",
            )
            decode_button = gr.Button("Decode")
            decoded_text_output = gr.Textbox(
                label="Decoded Text",
                lines=3,
                interactive=False,
            )
            decode_button.click(
                decode_text,
                inputs=encoded_text_input,
                outputs=decoded_text_output,
            )

    # One button below both columns resets every textbox; the order of
    # outputs matches the five values returned by clear_all().
    clear_button = gr.Button("Clear All")
    clear_button.click(
        clear_all,
        outputs=[
            text_input,
            encoded_text_output,
            compression_ratio_output,
            encoded_text_input,
            decoded_text_output,
        ],
    )
# Start the app; if launching raises, print the message and the full
# traceback to stdout instead of letting the exception propagate.
try:
    demo.launch()
except Exception as launch_error:
    trace_text = traceback.format_exc()
    print(f"Error launching interface: {launch_error!s}")
    print(trace_text)