Spaces:

tranquilkd
/

GujaratiTokenizer

Sleeping

File size: 4,995 Bytes

9fc1dc2
a911970
 
 
 
 
 
 
9fc1dc2
 
a911970
9fc1dc2
a911970
 
9fc1dc2
a911970
 
 
 
9fc1dc2
a911970
9fc1dc2
a911970
9fc1dc2
 
 
a911970
 
 
 
 
 
 
 
 
9fc1dc2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a911970
9fc1dc2
a911970
9fc1dc2
 
 
a911970
 
 
 
 
9fc1dc2
 
a911970
 
 
 
 
 
9fc1dc2
 
 
a911970
9fc1dc2
 
 
 
 
 
 
 
 
 
a911970
9fc1dc2
 
 
 
 
 
 
a911970
9fc1dc2

import traceback
import gradio as gr
from tokenizer import GujaratiBPETokenizer

# Build the shared tokenizer once at import time from its JSON file; the encode
# and decode handlers below both use this module-level instance.
# NOTE(review): this relies on load() returning the tokenizer object - confirm.
_TOKENIZER_FILE = "Gujarati_tokenizer.json"
tokenizer = GujaratiBPETokenizer().load(_TOKENIZER_FILE)


# Function to encode the text and return both the encoded text and compression ratio
def encode_text(input_text):
    """Tokenize ``input_text`` and measure how compactly it was encoded.

    Args:
        input_text: The text to pass to the module-level tokenizer.

    Returns:
        A ``(token_ids, compression_ratio)`` pair. The ratio is the UTF-8 byte
        length of the input divided by the number of token IDs, rounded to two
        decimals, or ``0.0`` when the tokenizer produced no tokens.
    """
    ids = tokenizer.encode(input_text)

    # Sizes compared: raw UTF-8 bytes of the text vs. number of emitted tokens.
    n_bytes = len(input_text.encode('utf-8'))
    n_tokens = len(ids)

    # Guard against dividing by zero when nothing was tokenized.
    if n_tokens > 0:
        ratio = round(n_bytes / n_tokens, 2)
    else:
        ratio = 0.0

    return ids, ratio


# Function to decode the encoded text back to original text
def decode_text(token_ids):
    """Decode a textual list of token IDs back into text.

    Args:
        token_ids: Token IDs as typed into the UI, either comma-separated
            (``"1, 2, 3"``) or written as a bracketed list (``"[1, 2, 3]"``).
            Surrounding whitespace/newlines, blank entries and a trailing
            comma are tolerated. ``None`` is treated like an empty string.

    Returns:
        The decoded text; an empty string when no IDs were supplied; or a
        human-readable error message when the IDs cannot be parsed or the
        tokenizer fails to decode them.
    """
    # Trim whitespace BEFORE removing the brackets: a pasted "[1, 2]\n" would
    # otherwise keep its closing bracket and fail integer parsing.
    raw = (token_ids or "").strip().strip("[]")

    # Drop blank entries so empty input and trailing commas are not errors.
    pieces = [piece.strip() for piece in raw.split(",")]
    pieces = [piece for piece in pieces if piece]
    if not pieces:
        return ""

    try:
        ids = [int(piece) for piece in pieces]
    except ValueError as e:
        return f"Error in processing token IDs: {e}"

    # The tokenizer may reject IDs it does not know; report that in the output
    # box like parse errors instead of letting the exception escape the handler.
    try:
        return tokenizer.decode(ids)
    except Exception as e:
        return f"Error in decoding token IDs: {e}"


# Function to clear all input and output textboxes
def clear_all():
    """Return five empty strings, one for each textbox reset by "Clear All"."""
    blank = ""
    return (blank,) * 5


# Build the Gradio UI: a title banner, an encode column and a decode column placed
# in one row, and a single "Clear All" button underneath. Components are created
# inside nested context managers, so the statement order below defines the layout.
with gr.Blocks() as demo:
    # Page heading with a GitHub icon that links to the project repository.
    # NOTE: the backslash continuations are inside the string literal, so the
    # leading whitespace of each continued line is part of the HTML text.
    article = "<h1 style='text-align: center;'>  Gujarati BPE Tokenizer  \
        <a href='https://github.com/KD1994/session-11-BPE-Tokenizer' target='_blank'> \
            <i class='fab fa-github' style='font-size: 24px;'></i></a> \
            </h1>"
    # Pull in the Font Awesome stylesheet from cdnjs; the heading's
    # `fab fa-github` icon classes come from it.
    gr.HTML("""
    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
    """)
    gr.HTML(article)

    with gr.Row():
        # Column 1: Encoding
        with gr.Column():
            gr.Markdown("## Encode Gujarati Text")
            # Free-text input for the sentence to tokenize.
            # NOTE(review): the effect of `key` depends on the installed Gradio
            # version - confirm it is accepted by the pinned release.
            text_input = gr.Textbox(
                label="Input Text", 
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input"
            )
            encode_button = gr.Button("Encode")
            # Read-only boxes that receive the two values returned by encode_text.
            encoded_text_output = gr.Textbox(label="Encoded Text", lines=4, interactive=False)
            compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
            # Sample Gujarati sentences offered for the input box. `fn` and `outputs`
            # are supplied so the examples can be run through encode_text.
            # NOTE(review): whether results are precomputed depends on Gradio's
            # example-caching setting, which is not set here - confirm.
            encode_example = gr.Examples(
                examples=["ગુજરાત અને ભારતમાં સ્થાન", 
                          "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે", 
                          "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.", 
                          "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
                inputs=text_input,
                outputs=[encoded_text_output, compression_ratio_output],
                fn=encode_text
            )
            
            # Clicking "Encode" sends the input text to encode_text and routes its
            # (token_ids, compression_ratio) result to the two output boxes, in order.
            encode_button.click(encode_text, 
                                inputs=text_input, 
                                outputs=[encoded_text_output, 
                                         compression_ratio_output]
                                )


        # Column 2: Decoding
        with gr.Column():
            gr.Markdown("## Decode Tokens to Gujarati Text")
            # Token IDs are entered as plain text; decode_text parses the string.
            encoded_text_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input"
            )
            decode_button = gr.Button("Decode")
            decoded_text_output = gr.Textbox(label="Decoded Text", lines=3, interactive=False)

            # Clicking "Decode" passes the typed IDs to decode_text and shows the
            # returned string (decoded text or an error message) in the output box.
            decode_button.click(decode_text, 
                                inputs=encoded_text_input, 
                                outputs=decoded_text_output
                                )

    # Add a single clear button at the bottom to clear everything
    clear_button = gr.Button("Clear All")  # A button to clear everything

    # Link the clear button to clear all textboxes. clear_all takes no inputs and
    # returns five empty strings, matched positionally to the five outputs below.
    clear_button.click(clear_all, outputs=[text_input, 
                                           encoded_text_output, 
                                           compression_ratio_output, 
                                           encoded_text_input, 
                                           decoded_text_output
                                           ]
                                           )

# Start the app. A launch failure is reported on stdout (a short message followed
# by the full traceback) rather than being allowed to propagate.
try:
    demo.launch()
except Exception as launch_error:
    trace_text = traceback.format_exc()
    print(f"Error launching interface: {launch_error!s}")
    print(trace_text)