"""Gradio demo for encoding and decoding Gujarati text with a BPE tokenizer."""

import traceback

import gradio as gr

from tokenizer import GujaratiBPETokenizer

# Load the tokenizer once at import time; every callback below shares it.
tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")


def encode_text(input_text: str):
    """Encode text and report how well it compresses.

    Args:
        input_text: The text typed into the "Input Text" box.

    Returns:
        A ``(token_ids, compression_ratio)`` pair, where ``token_ids`` is
        whatever ``tokenizer.encode`` returns and ``compression_ratio`` is the
        UTF-8 byte length of the input divided by the number of token IDs,
        rounded to two decimals (``0.0`` when no tokens were produced).
    """
    token_ids = tokenizer.encode(input_text)

    # Compare the raw UTF-8 size against the token count: Gujarati characters
    # take several bytes each, so bytes-per-token is the meaningful measure.
    text_byte_length = len(input_text.encode("utf-8"))
    token_id_length = len(token_ids)

    # Guard against division by zero when the input yields no tokens.
    if token_id_length > 0:
        compression_ratio = round(text_byte_length / token_id_length, 2)
    else:
        compression_ratio = 0.0

    return token_ids, compression_ratio


def _parse_token_ids(raw):
    """Turn ``"[1, 2, 3]"`` or ``"1,2,3"`` into a list of ints.

    Surrounding whitespace, optional square brackets, and empty pieces (for
    example from a trailing comma) are ignored.

    Raises:
        ValueError: If any non-empty piece is not a valid integer.
    """
    # Trim whitespace *before* removing brackets; otherwise a leading space or
    # trailing newline shields the brackets from ``strip("[]")``.
    cleaned = (raw or "").strip().strip("[]")
    return [int(piece) for piece in cleaned.split(",") if piece.strip()]


def decode_text(token_ids: str) -> str:
    """Decode a textual list of token IDs back into text.

    Args:
        token_ids: Token IDs as typed by the user, comma-separated and
            optionally wrapped in square brackets.

    Returns:
        The decoded text, an empty string when no IDs were supplied, or a
        message starting with ``"Error in processing token IDs:"`` when the
        input cannot be parsed as integers.
    """
    try:
        parsed_ids = _parse_token_ids(token_ids)
    except ValueError as e:
        return f"Error in processing token IDs: {e}"

    # Nothing to decode (empty box, e.g. right after "Clear All").
    if not parsed_ids:
        return ""

    return tokenizer.decode(parsed_ids)


def clear_all():
    """Return one empty string for each of the five textboxes to reset."""
    return "", "", "", "", ""


# Create Gradio interface
with gr.Blocks() as demo:
    # NOTE(review): the markup of this banner (and of the blank HTML block
    # below) appears to have been stripped from the source; only the title
    # text survived. Restore the original tags/styles if they are available.
    article = "Gujarati BPE Tokenizer"
    gr.HTML(""" """)
    gr.HTML(article)

    with gr.Row():
        # Column 1: Encoding
        with gr.Column():
            gr.Markdown("## Encode Gujarati Text")
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input",
            )
            encode_button = gr.Button("Encode")
            encoded_text_output = gr.Textbox(
                label="Encoded Text", lines=4, interactive=False
            )
            compression_ratio_output = gr.Textbox(
                label="Compression Ratio", interactive=False
            )

            # Example for encoding
            encode_example = gr.Examples(
                examples=[
                    "ગુજરાત અને ભારતમાં સ્થાન",
                    "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                    "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                    "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે.",
                ],
                inputs=text_input,
                outputs=[encoded_text_output, compression_ratio_output],
                fn=encode_text,
            )

            # Link the encoding function to the button click
            encode_button.click(
                encode_text,
                inputs=text_input,
                outputs=[encoded_text_output, compression_ratio_output],
            )

        # Column 2: Decoding
        with gr.Column():
            gr.Markdown("## Decode Tokens to Gujarati Text")
            encoded_text_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input",
            )
            decode_button = gr.Button("Decode")
            decoded_text_output = gr.Textbox(
                label="Decoded Text", lines=3, interactive=False
            )

            decode_button.click(
                decode_text,
                inputs=encoded_text_input,
                outputs=decoded_text_output,
            )

    # A single button at the bottom clears every textbox; the outputs are
    # listed in the same order as the five values returned by clear_all().
    clear_button = gr.Button("Clear All")
    clear_button.click(
        clear_all,
        outputs=[
            text_input,
            encoded_text_output,
            compression_ratio_output,
            encoded_text_input,
            decoded_text_output,
        ],
    )

# Report launch failures with a full traceback instead of a bare crash.
try:
    demo.launch()
except Exception as e:
    print(f"Error launching interface: {str(e)}")
    print(traceback.format_exc())