Spaces:

tranquilkd
/

GujaratiTokenizer

Sleeping

App Files Files Community

tranquilkd committed on Jan 11

Commit

9fc1dc2

1 Parent(s): a911970

UI change

Browse files

Files changed (2) hide show

.gitignore +1 -0
app.py +77 -57

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ __pycache__

app.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import gradio as gr
 from tokenizer import GujaratiBPETokenizer
@@ -5,41 +6,24 @@ from tokenizer import GujaratiBPETokenizer
 tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")
-def encode_text(text):
-    """
-    Encodes the given Gujarati text into token IDs.
-    """
-    token_ids = tokenizer.encode(text)
-    return token_ids
-def encode_text_with_compression(text):
-    """
-    Encodes the given Gujarati text into token IDs and calculates the compression ratio.
-    """
     # Get token IDs
-    token_ids = tokenizer.encode(text)
     # Calculate the original text size in bytes
-    text_byte_length = len(text.encode('utf-8'))
     # Calculate the number of token IDs
     token_id_length = len(token_ids)
-    # Compression ratio
-    if text_byte_length > 0:
-        compression_ratio =  text_byte_length / token_id_length
-    else:
-        compression_ratio = 0  # Handle edge case for empty input
-    return token_ids, f"{compression_ratio:.2f}"
-def decode_tokens(token_ids):
-    """
-    Decodes the given token IDs into Gujarati text.
-    """
-    # Ensure token_ids is a list of integers
     try:
         token_ids = list(map(int, token_ids.strip("[]").split(",")))
     except Exception as e:
@@ -49,50 +33,86 @@ def decode_tokens(token_ids):
     return decoded_text
-# Gradio interface
-with gr.Blocks() as app:
-    gr.Markdown("## Gujarati Tokenizer Encoder-Decoder")
     with gr.Row():
         with gr.Column():
-            gr.Markdown("### Encode Gujarati Text to Token IDs")
-            Gujarati_text_input = gr.Textbox(
-                label="Enter Gujarati Text",
                 placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                 lines=4,
                 key="encode_input"
             )
-            token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
-            compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
             encode_button = gr.Button("Encode")
             # Example for encoding
             encode_example = gr.Examples(
                 examples=["ગુજરાત અને ભારતમાં સ્થાન",
                           "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                           "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                           "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
-                inputs=Gujarati_text_input,
-                outputs=[token_ids_output, compression_ratio_output],
-                fn=encode_text_with_compression
             )
         with gr.Column():
-            gr.Markdown("### Decode Token IDs to Gujarati Text")
-            token_ids_input = gr.Textbox(
-                    label="Enter Token IDs (comma-separated or List)",
-                    placeholder="[2517, 2074, 340, 4, 201]",
-                    lines=4,
-                    key="decode_input"
-                )
-            decoded_text_output = gr.Textbox(label="Decoded Gujarati Text", interactive=False)
             decode_button = gr.Button("Decode")
-    encode_button.click(
-        encode_text_with_compression,
-        inputs=Gujarati_text_input,
-        outputs=[token_ids_output, compression_ratio_output]
-    )
-    decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)
-app.launch()

+import traceback
 import gradio as gr
 from tokenizer import GujaratiBPETokenizer
 tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")
+# Function to encode the text and return both the encoded text and compression ratio
+def encode_text(input_text):
     # Get token IDs
+    token_ids = tokenizer.encode(input_text)
     # Calculate the original text size in bytes
+    text_byte_length = len(input_text.encode('utf-8'))
     # Calculate the number of token IDs
     token_id_length = len(token_ids)
+    compression_ratio = round(text_byte_length / token_id_length, 2) if token_id_length > 0 else 0.0
+    return token_ids, compression_ratio
+# Function to decode the encoded text back to original text
+def decode_text(token_ids):
     try:
         token_ids = list(map(int, token_ids.strip("[]").split(",")))
     except Exception as e:
     return decoded_text
+# Function to clear all input and output textboxes
+def clear_all():
+    return "", "", "", "", ""
+# Create Gradio interface
+with gr.Blocks() as demo:
+    article = "<h1 style='text-align: center;'>  Gujarati BPE Tokenizer  \
+        <a href='https://github.com/KD1994/session-11-BPE-Tokenizer' target='_blank'> \
+            <i class='fab fa-github' style='font-size: 24px;'></i></a> \
+            </h1>"
+    gr.HTML("""
+    <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
+    """)
+    gr.HTML(article)
     with gr.Row():
+        # Column 1: Encoding
         with gr.Column():
+            gr.Markdown("## Encode Gujarati Text")
+            text_input = gr.Textbox(
+                label="Input Text",
                 placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                 lines=4,
                 key="encode_input"
             )
             encode_button = gr.Button("Encode")
+            encoded_text_output = gr.Textbox(label="Encoded Text", lines=4, interactive=False)
+            compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
             # Example for encoding
             encode_example = gr.Examples(
                 examples=["ગુજરાત અને ભારતમાં સ્થાન",
                           "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                           "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                           "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
+                inputs=text_input,
+                outputs=[encoded_text_output, compression_ratio_output],
+                fn=encode_text
             )
+            # Link the encoding function to the button click
+            encode_button.click(encode_text,
+                                inputs=text_input,
+                                outputs=[encoded_text_output,
+                                         compression_ratio_output]
+                                )
+        # Column 2: Decoding
         with gr.Column():
+            gr.Markdown("## Decode Tokens to Gujarati Text")
+            encoded_text_input = gr.Textbox(
+                label="Enter Token IDs (comma-separated or List)",
+                placeholder="[2517, 2074, 340, 4, 201]",
+                lines=4,
+                key="decode_input"
+            )
             decode_button = gr.Button("Decode")
+            decoded_text_output = gr.Textbox(label="Decoded Text", lines=3, interactive=False)
+            decode_button.click(decode_text,
+                                inputs=encoded_text_input,
+                                outputs=decoded_text_output
+                                )
+    # Add a single clear button at the bottom to clear everything
+    clear_button = gr.Button("Clear All")  # A button to clear everything
+    # Link the clear button to clear all textboxes
+    clear_button.click(clear_all, outputs=[text_input,
+                                           encoded_text_output,
+                                           compression_ratio_output,
+                                           encoded_text_input,
+                                           decoded_text_output
+                                           ]
+                                           )
+# Add error handling to launch
+try:
+    demo.launch()
+except Exception as e:
+    print(f"Error launching interface: {str(e)}")
+    print(traceback.format_exc())