tranquilkd committed on
Commit
9fc1dc2
·
1 Parent(s): a911970
Files changed (2) hide show
  1. .gitignore +1 -0
  2. app.py +77 -57
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  from tokenizer import GujaratiBPETokenizer
3
 
@@ -5,41 +6,24 @@ from tokenizer import GujaratiBPETokenizer
5
  tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")
6
 
7
 
8
- def encode_text(text):
9
- """
10
- Encodes the given Gujarati text into token IDs.
11
- """
12
- token_ids = tokenizer.encode(text)
13
- return token_ids
14
-
15
-
16
- def encode_text_with_compression(text):
17
- """
18
- Encodes the given Gujarati text into token IDs and calculates the compression ratio.
19
- """
20
  # Get token IDs
21
- token_ids = tokenizer.encode(text)
22
 
23
  # Calculate the original text size in bytes
24
- text_byte_length = len(text.encode('utf-8'))
25
 
26
  # Calculate the number of token IDs
27
  token_id_length = len(token_ids)
28
 
29
- # Compression ratio
30
- if text_byte_length > 0:
31
- compression_ratio = text_byte_length / token_id_length
32
- else:
33
- compression_ratio = 0 # Handle edge case for empty input
34
-
35
- return token_ids, f"{compression_ratio:.2f}"
36
 
 
37
 
38
- def decode_tokens(token_ids):
39
- """
40
- Decodes the given token IDs into Gujarati text.
41
- """
42
- # Ensure token_ids is a list of integers
43
  try:
44
  token_ids = list(map(int, token_ids.strip("[]").split(",")))
45
  except Exception as e:
@@ -49,50 +33,86 @@ def decode_tokens(token_ids):
49
  return decoded_text
50
 
51
 
52
- # Gradio interface
53
- with gr.Blocks() as app:
54
- gr.Markdown("## Gujarati Tokenizer Encoder-Decoder")
55
-
 
 
 
 
 
 
 
 
 
 
 
 
56
  with gr.Row():
 
57
  with gr.Column():
58
- gr.Markdown("### Encode Gujarati Text to Token IDs")
59
- Gujarati_text_input = gr.Textbox(
60
- label="Enter Gujarati Text",
61
  placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
62
  lines=4,
63
  key="encode_input"
64
  )
65
- token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
66
- compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
67
  encode_button = gr.Button("Encode")
68
-
 
69
  # Example for encoding
70
  encode_example = gr.Examples(
71
  examples=["ગુજરાત અને ભારતમાં સ્થાન",
72
  "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
73
  "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
74
  "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
75
- inputs=Gujarati_text_input,
76
- outputs=[token_ids_output, compression_ratio_output],
77
- fn=encode_text_with_compression
78
  )
79
-
 
 
 
 
 
 
 
 
 
80
  with gr.Column():
81
- gr.Markdown("### Decode Token IDs to Gujarati Text")
82
- token_ids_input = gr.Textbox(
83
- label="Enter Token IDs (comma-separated or List)",
84
- placeholder="[2517, 2074, 340, 4, 201]",
85
- lines=4,
86
- key="decode_input"
87
- )
88
- decoded_text_output = gr.Textbox(label="Decoded Gujarati Text", interactive=False)
89
  decode_button = gr.Button("Decode")
90
-
91
- encode_button.click(
92
- encode_text_with_compression,
93
- inputs=Gujarati_text_input,
94
- outputs=[token_ids_output, compression_ratio_output]
95
- )
96
- decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)
97
-
98
- app.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import traceback
2
  import gradio as gr
3
  from tokenizer import GujaratiBPETokenizer
4
 
 
6
  tokenizer = GujaratiBPETokenizer().load("Gujarati_tokenizer.json")
7
 
8
 
9
+ # Function to encode the text and return the token IDs together with the compression ratio (UTF-8 bytes per token)
10
+ def encode_text(input_text):
 
 
 
 
 
 
 
 
 
 
11
  # Get token IDs
12
+ token_ids = tokenizer.encode(input_text)
13
 
14
  # Calculate the original text size in bytes
15
+ text_byte_length = len(input_text.encode('utf-8'))
16
 
17
  # Calculate the number of token IDs
18
  token_id_length = len(token_ids)
19
 
20
+ compression_ratio = round(text_byte_length / token_id_length, 2) if token_id_length > 0 else 0.0
 
 
 
 
 
 
21
 
22
+ return token_ids, compression_ratio
23
 
24
+
25
+ # Function to decode a comma-separated (optionally bracketed) string of token IDs back to text
26
+ def decode_text(token_ids):
 
 
27
  try:
28
  token_ids = list(map(int, token_ids.strip("[]").split(",")))
29
  except Exception as e:
 
33
  return decoded_text
34
 
35
 
36
+ # Function to clear all input and output textboxes
37
+ def clear_all():
38
+ return "", "", "", "", ""
39
+
40
+
41
+ # Create Gradio interface
42
+ with gr.Blocks() as demo:
43
+ article = "<h1 style='text-align: center;'> Gujarati BPE Tokenizer \
44
+ <a href='https://github.com/KD1994/session-11-BPE-Tokenizer' target='_blank'> \
45
+ <i class='fab fa-github' style='font-size: 24px;'></i></a> \
46
+ </h1>"
47
+ gr.HTML("""
48
+ <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0-beta3/css/all.min.css">
49
+ """)
50
+ gr.HTML(article)
51
+
52
  with gr.Row():
53
+ # Column 1: Encoding
54
  with gr.Column():
55
+ gr.Markdown("## Encode Gujarati Text")
56
+ text_input = gr.Textbox(
57
+ label="Input Text",
58
  placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
59
  lines=4,
60
  key="encode_input"
61
  )
 
 
62
  encode_button = gr.Button("Encode")
63
+ encoded_text_output = gr.Textbox(label="Encoded Text", lines=4, interactive=False)
64
+ compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
65
  # Example for encoding
66
  encode_example = gr.Examples(
67
  examples=["ગુજરાત અને ભારતમાં સ્થાન",
68
  "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
69
  "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
70
  "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
71
+ inputs=text_input,
72
+ outputs=[encoded_text_output, compression_ratio_output],
73
+ fn=encode_text
74
  )
75
+
76
+ # Link the encoding function to the button click
77
+ encode_button.click(encode_text,
78
+ inputs=text_input,
79
+ outputs=[encoded_text_output,
80
+ compression_ratio_output]
81
+ )
82
+
83
+
84
+ # Column 2: Decoding
85
  with gr.Column():
86
+ gr.Markdown("## Decode Tokens to Gujarati Text")
87
+ encoded_text_input = gr.Textbox(
88
+ label="Enter Token IDs (comma-separated or List)",
89
+ placeholder="[2517, 2074, 340, 4, 201]",
90
+ lines=4,
91
+ key="decode_input"
92
+ )
 
93
  decode_button = gr.Button("Decode")
94
+ decoded_text_output = gr.Textbox(label="Decoded Text", lines=3, interactive=False)
95
+
96
+ decode_button.click(decode_text,
97
+ inputs=encoded_text_input,
98
+ outputs=decoded_text_output
99
+ )
100
+
101
+ # Add a single clear button at the bottom to clear everything
102
+ clear_button = gr.Button("Clear All") # A button to clear everything
103
+
104
+ # Link the clear button to clear all textboxes
105
+ clear_button.click(clear_all, outputs=[text_input,
106
+ encoded_text_output,
107
+ compression_ratio_output,
108
+ encoded_text_input,
109
+ decoded_text_output
110
+ ]
111
+ )
112
+
113
+ # Add error handling to launch
114
+ try:
115
+ demo.launch()
116
+ except Exception as e:
117
+ print(f"Error launching interface: {str(e)}")
118
+ print(traceback.format_exc())