Spaces:
Sleeping
Sleeping
File size: 4,995 Bytes
9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 a911970 9fc1dc2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
import traceback
import gradio as gr
from tokenizer import GujaratiBPETokenizer
# Load the tokenizer from "Gujarati_tokenizer.json" once, at import time; the
# module-level `tokenizer` is what encode_text() and decode_text() call into.
# NOTE(review): this relies on load() returning the tokenizer object itself
# (not None) — confirm against the tokenizer module.
tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")
# Function to encode the text and return both the encoded text and compression ratio
def encode_text(input_text):
    """Tokenize ``input_text`` and measure how compactly it was encoded.

    Args:
        input_text: The text to encode.

    Returns:
        A ``(token_ids, compression_ratio)`` pair. The ratio is the UTF-8
        byte length of the text divided by the number of token IDs, rounded
        to two decimals; it is ``0.0`` when no tokens were produced.
    """
    ids = tokenizer.encode(input_text)
    n_bytes = len(input_text.encode("utf-8"))
    n_tokens = len(ids)

    # Guard against dividing by zero when the tokenizer yields nothing.
    if n_tokens <= 0:
        return ids, 0.0
    return ids, round(n_bytes / n_tokens, 2)
# Function to decode the encoded text back to original text
def decode_text(token_ids):
    """Decode a textual list of token IDs back into text.

    Accepts either a bare comma-separated sequence (``"1, 2, 3"``) or a
    bracketed list (``"[1, 2, 3]"``). Surrounding whitespace and newlines are
    ignored, so text pasted from elsewhere parses cleanly.

    Args:
        token_ids: The raw string typed or pasted by the user.

    Returns:
        The decoded text; an empty string when no IDs were supplied; or a
        human-readable error message when the input cannot be parsed or
        decoded. Errors are returned rather than raised so they can be shown
        directly in the output textbox.
    """
    # Trim whitespace *before* removing brackets: str.strip("[]") only removes
    # brackets at the very ends, so " [1, 2] " would otherwise keep its "]".
    cleaned = (token_ids or "").strip().strip("[]").strip()
    if not cleaned:
        # Nothing to decode (empty box or "[]"): empty output, not an error.
        return ""

    try:
        ids = [int(piece) for piece in cleaned.split(",")]
    except ValueError as e:
        return f"Error in processing token IDs: {e}"

    try:
        return tokenizer.decode(ids)
    except Exception as e:
        # The exception type raised for IDs the tokenizer cannot decode is
        # defined by the tokenizer implementation, so catch broadly here and
        # report it the same way as a parse failure.
        return f"Error in decoding token IDs: {e}"
# Function to clear all input and output textboxes
def clear_all():
    """Return five empty strings, one for each textbox the clear button resets."""
    blank = ""
    return (blank,) * 5
# Create Gradio interface
with gr.Blocks() as demo:
    # Page heading: the title plus a GitHub icon linking to the project repo.
    article = (
        "<h1 style='text-align: center;'> Gujarati BPE Tokenizer "
        "<a href='https://github.com/KD1994/session-11-BPE-Tokenizer' target='_blank'> "
        "<i class='fab fa-github' style='font-size: 24px;'></i></a> "
        "</h1>"
    )

    # Pull in the Font Awesome stylesheet that provides the icon classes
    # used in the heading.
    gr.HTML(
        '\n<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">\n'
    )
    gr.HTML(article)

    with gr.Row():
        # Left column: text -> token IDs and compression ratio.
        with gr.Column():
            gr.Markdown("## Encode Gujarati Text")
            text_input = gr.Textbox(
                label="Input Text",
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input",
            )
            encode_button = gr.Button("Encode")
            encoded_text_output = gr.Textbox(
                label="Encoded Text",
                lines=4,
                interactive=False,
            )
            compression_ratio_output = gr.Textbox(
                label="Compression Ratio",
                interactive=False,
            )

            # Both the examples widget and the button feed these two boxes.
            encode_outputs = [encoded_text_output, compression_ratio_output]

            # Ready-made sentences the user can pick as input.
            sample_sentences = [
                "ગુજરાત અને ભારતમાં સ્થાન",
                "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે.",
            ]
            encode_example = gr.Examples(
                examples=sample_sentences,
                inputs=text_input,
                outputs=encode_outputs,
                fn=encode_text,
            )

            # Run the encoder when the button is pressed.
            encode_button.click(
                fn=encode_text,
                inputs=text_input,
                outputs=encode_outputs,
            )

        # Right column: token IDs -> text.
        with gr.Column():
            gr.Markdown("## Decode Tokens to Gujarati Text")
            encoded_text_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input",
            )
            decode_button = gr.Button("Decode")
            decoded_text_output = gr.Textbox(
                label="Decoded Text",
                lines=3,
                interactive=False,
            )
            decode_button.click(
                fn=decode_text,
                inputs=encoded_text_input,
                outputs=decoded_text_output,
            )

    # One button below both columns that blanks every textbox; the order of
    # this list must match the order of the values clear_all() returns.
    clear_button = gr.Button("Clear All")
    clear_button.click(
        fn=clear_all,
        outputs=[
            text_input,
            encoded_text_output,
            compression_ratio_output,
            encoded_text_input,
            decoded_text_output,
        ],
    )
# Start the app; if launching fails, print the error and its traceback
# instead of letting the exception propagate.
try:
    demo.launch()
except Exception as launch_error:
    print(f"Error launching interface: {launch_error!s}")
    print(traceback.format_exc())
|