Spaces:

tranquilkd
/

GujaratiTokenizer

Sleeping

File size: 3,865 Bytes

a911970

import gradio as gr
from tokenizer import GujaratiBPETokenizer

# Load the tokenizer once at import time; the handler functions below all use
# this single module-level instance.
# NOTE(review): this assigns the *return value* of `load()`, so it relies on
# `load()` returning the tokenizer object (not None) — confirm in tokenizer.py.
# NOTE(review): the JSON path is relative; presumably it is resolved against
# the process working directory — verify against how the app is started.
tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")


def encode_text(text):
    """
    Return the token IDs produced by the shared tokenizer for ``text``.

    This is a thin wrapper around ``tokenizer.encode``; the result is passed
    through unchanged.
    """
    return tokenizer.encode(text)


def encode_text_with_compression(text):
    """
    Encode Gujarati text into token IDs and compute the compression ratio.

    The compression ratio is the UTF-8 byte length of ``text`` divided by the
    number of token IDs produced for it.

    Args:
        text: The text to encode.

    Returns:
        A ``(token_ids, ratio)`` tuple. ``token_ids`` is the value returned by
        ``tokenizer.encode`` and ``ratio`` is the compression ratio formatted
        as a string with two decimal places. The ratio is ``"0.00"`` when no
        tokens were produced (for example, for empty input).
    """
    token_ids = tokenizer.encode(text)

    # Size of the original text in bytes versus the number of tokens emitted.
    byte_count = len(text.encode("utf-8"))
    token_count = len(token_ids)

    # Guard on the divisor rather than the dividend: non-empty text that
    # yields zero tokens must not raise ZeroDivisionError.
    compression_ratio = byte_count / token_count if token_count else 0

    return token_ids, f"{compression_ratio:.2f}"
    

def decode_tokens(token_ids):
    """
    Decode a textual list of token IDs into Gujarati text.

    Accepts either a bare comma-separated list (``"1, 2, 3"``) or a
    bracketed list (``"[1, 2, 3]"``). Surrounding whitespace and blank
    entries (such as a trailing comma) are ignored.

    Args:
        token_ids: The token IDs as typed by the user, or ``None``.

    Returns:
        The decoded text, an empty string when no IDs were supplied, or a
        message starting with ``"Error in processing token IDs:"`` when the
        input cannot be parsed as integers.
    """
    if token_ids is None:
        return ""

    try:
        # Trim whitespace first so the brackets are actually at the ends of
        # the string when they are stripped.
        cleaned = token_ids.strip().strip("[]")
        # Skip blank pieces so "" / "[]" / "1, 2," do not hit int("").
        parsed_ids = [int(piece) for piece in cleaned.split(",") if piece.strip()]
    except (AttributeError, ValueError) as e:
        return f"Error in processing token IDs: {e}"

    # Nothing to decode: avoid calling the tokenizer with an empty list.
    if not parsed_ids:
        return ""

    return tokenizer.decode(parsed_ids)


# Gradio interface: an encoder column and a decoder column side by side.
with gr.Blocks() as app:
    gr.Markdown("## Gujarati Tokenizer Encoder-Decoder")

    with gr.Row():
        # Encoder column: Gujarati text in, token IDs and compression ratio out.
        with gr.Column():
            gr.Markdown("### Encode Gujarati Text to Token IDs")
            Gujarati_text_input = gr.Textbox(
                label="Enter Gujarati Text",
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input",
            )
            token_ids_output = gr.Textbox(
                label="Token IDs (Encoded)",
                interactive=False,
            )
            compression_ratio_output = gr.Textbox(
                label="Compression Ratio",
                interactive=False,
            )
            encode_button = gr.Button("Encode")

            # Sample sentences offered below the encoder input.
            sample_sentences = [
                "ગુજરાત અને ભારતમાં સ્થાન",
                "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે.",
            ]
            encode_example = gr.Examples(
                examples=sample_sentences,
                inputs=Gujarati_text_input,
                outputs=[token_ids_output, compression_ratio_output],
                fn=encode_text_with_compression,
            )

        # Decoder column: token IDs in, Gujarati text out.
        with gr.Column():
            gr.Markdown("### Decode Token IDs to Gujarati Text")
            token_ids_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input",
            )
            decoded_text_output = gr.Textbox(
                label="Decoded Gujarati Text",
                interactive=False,
            )
            decode_button = gr.Button("Decode")

    # Wire each button to its handler.
    encode_button.click(
        fn=encode_text_with_compression,
        inputs=Gujarati_text_input,
        outputs=[token_ids_output, compression_ratio_output],
    )
    decode_button.click(
        fn=decode_tokens,
        inputs=token_ids_input,
        outputs=decoded_text_output,
    )

app.launch()