Spaces:

tranquilkd
/

GujaratiTokenizer

Sleeping

App Files Files Community

GujaratiTokenizer / app.py

tranquilkd

First Commit

a911970 6 months ago

raw

history blame

3.87 kB

	import gradio as gr
	from tokenizer import GujaratiBPETokenizer

	# Create the single module-level tokenizer used by the encode/decode handlers
	# in this file, loading it from "Gujarati_tokenizer.json" at import time.
	# NOTE(review): this binds whatever load() returns -- confirm that
	# GujaratiBPETokenizer.load returns the tokenizer instance (not None).
	tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")


	def encode_text(text):
	    """
	    Return the token IDs produced by the shared tokenizer for ``text``.
	    """
	    return tokenizer.encode(text)


	def encode_text_with_compression(text):
	    """
	    Encode text into token IDs and report the bytes-per-token ratio.

	    The ratio is the UTF-8 byte length of ``text`` divided by the number of
	    token IDs, formatted with two decimal places.

	    Args:
	        text: The text to encode.

	    Returns:
	        A ``(token_ids, ratio)`` tuple, where ``ratio`` is a string such as
	        ``"3.25"``. It is ``"0.00"`` when the text is empty or when encoding
	        yields no tokens.
	    """
	    token_ids = tokenizer.encode(text)

	    # Original size in bytes versus number of tokens emitted
	    byte_count = len(text.encode("utf-8"))
	    token_count = len(token_ids)

	    # Guard the divisor itself: checking only the byte length would still
	    # divide by zero if non-empty text produced an empty token list.
	    compression_ratio = byte_count / token_count if token_count else 0

	    return token_ids, f"{compression_ratio:.2f}"


	def decode_tokens(token_ids):
	    """
	    Decode user-supplied token IDs into text.

	    Args:
	        token_ids: Token IDs as text, e.g. ``"[2517, 2074, 340]"`` or
	            ``"2517 2074 340"``. Surrounding whitespace, optional square
	            brackets, and commas and/or whitespace between IDs are all
	            accepted. ``None`` is treated as blank input; any other
	            non-string value is converted with ``str()`` first.

	    Returns:
	        The decoded text, an empty string when no IDs were supplied, or a
	        message starting with ``"Error in processing token IDs:"`` when an
	        entry is not an integer.
	    """
	    raw = "" if token_ids is None else str(token_ids)

	    # Trim whitespace before the brackets so pasted input such as
	    # " [1, 2]\n" still has its brackets removed.
	    cleaned = raw.strip().strip("[]")

	    try:
	        # Treat commas as whitespace so trailing commas and mixed separators
	        # do not produce empty fragments that int() would reject.
	        ids = [int(piece) for piece in cleaned.replace(",", " ").split()]
	    except ValueError as e:
	        return f"Error in processing token IDs: {e}"

	    # Nothing to decode: return an empty result instead of an int('') error
	    if not ids:
	        return ""

	    # NOTE(review): decode() is not guarded here -- confirm how the tokenizer
	    # behaves for IDs outside its vocabulary.
	    return tokenizer.decode(ids)


	# Gradio UI: an encoding panel and a decoding panel placed in one row
	with gr.Blocks() as app:
	    gr.Markdown("## Gujarati Tokenizer Encoder-Decoder")

	    with gr.Row():
	        # Encoding panel: text in, token IDs and compression ratio out
	        with gr.Column():
	            gr.Markdown("### Encode Gujarati Text to Token IDs")
	            Gujarati_text_input = gr.Textbox(
	                key="encode_input",
	                label="Enter Gujarati Text",
	                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
	                lines=4,
	            )
	            token_ids_output = gr.Textbox(
	                label="Token IDs (Encoded)",
	                interactive=False,
	            )
	            compression_ratio_output = gr.Textbox(
	                label="Compression Ratio",
	                interactive=False,
	            )
	            encode_button = gr.Button("Encode")

	            # Sample sentences wired to the encoder input and outputs
	            encode_example = gr.Examples(
	                examples=[
	                    "ગુજરાત અને ભારતમાં સ્થાન",
	                    "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
	                    "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
	                    "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે.",
	                ],
	                fn=encode_text_with_compression,
	                inputs=Gujarati_text_input,
	                outputs=[token_ids_output, compression_ratio_output],
	            )

	        # Decoding panel: token IDs in, text out
	        with gr.Column():
	            gr.Markdown("### Decode Token IDs to Gujarati Text")
	            token_ids_input = gr.Textbox(
	                key="decode_input",
	                label="Enter Token IDs (comma-separated or List)",
	                placeholder="[2517, 2074, 340, 4, 201]",
	                lines=4,
	            )
	            decoded_text_output = gr.Textbox(
	                label="Decoded Gujarati Text",
	                interactive=False,
	            )
	            decode_button = gr.Button("Decode")

	    # Button wiring is registered while the Blocks context is still open
	    encode_button.click(
	        encode_text_with_compression,
	        inputs=Gujarati_text_input,
	        outputs=[token_ids_output, compression_ratio_output],
	    )
	    decode_button.click(
	        decode_tokens,
	        inputs=token_ids_input,
	        outputs=decoded_text_output,
	    )

	app.launch()