tranquilkd committed on
Commit a911970 · 1 Parent(s): 3a0d9b3

First Commit

Files changed (4)
  1. Gujarati_tokenizer.json +0 -0
  2. app.py +98 -0
  3. requirements.txt +4 -0
  4. tokenizer.py +308 -0
Gujarati_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,98 @@
import gradio as gr
from tokenizer import GujaratiBPETokenizer

# Load the tokenizer (load() is a classmethod; calling it on the class avoids
# building a throwaway instance, whose __init__ downloads UnicodeData.txt)
tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")


def encode_text(text):
    """
    Encodes the given Gujarati text into token IDs.
    """
    token_ids = tokenizer.encode(text)
    return token_ids


def encode_text_with_compression(text):
    """
    Encodes the given Gujarati text into token IDs and calculates the compression ratio.
    """
    # Get token IDs
    token_ids = tokenizer.encode(text)

    # Calculate the original text size in bytes
    text_byte_length = len(text.encode('utf-8'))

    # Calculate the number of token IDs
    token_id_length = len(token_ids)

    # Compression ratio (guard the denominator: empty input produces no tokens)
    if token_id_length > 0:
        compression_ratio = text_byte_length / token_id_length
    else:
        compression_ratio = 0  # Handle edge case for empty input

    return token_ids, f"{compression_ratio:.2f}"
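# Worked example (illustrative figures, not from a real run): a Gujarati sentence
# occupying 60 UTF-8 bytes that encodes to 15 token IDs would report a
# compression ratio of 60 / 15 = 4.00 bytes per token.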


def decode_tokens(token_ids):
    """
    Decodes the given token IDs into Gujarati text.
    """
    # Ensure token_ids is a list of integers
    try:
        token_ids = list(map(int, token_ids.strip("[]").split(",")))
    except Exception as e:
        return f"Error in processing token IDs: {e}"

    decoded_text = tokenizer.decode(token_ids)
    return decoded_text
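# The parsing above accepts either a bracketed list or bare comma-separated IDs,
# e.g. "[2517, 2074, 340]" or "2517, 2074, 340" (IDs shown are placeholders).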


# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Gujarati Tokenizer Encoder-Decoder")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Encode Gujarati Text to Token IDs")
            Gujarati_text_input = gr.Textbox(
                label="Enter Gujarati Text",
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input"
            )
            token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
            compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
            encode_button = gr.Button("Encode")

            # Example for encoding
            encode_example = gr.Examples(
                examples=["ગુજરાત અને ભારતમાં સ્થાન",
                          "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                          "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                          "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
                inputs=Gujarati_text_input,
                outputs=[token_ids_output, compression_ratio_output],
                fn=encode_text_with_compression
            )

        with gr.Column():
            gr.Markdown("### Decode Token IDs to Gujarati Text")
            token_ids_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input"
            )
            decoded_text_output = gr.Textbox(label="Decoded Gujarati Text", interactive=False)
            decode_button = gr.Button("Decode")

    encode_button.click(
        encode_text_with_compression,
        inputs=Gujarati_text_input,
        outputs=[token_ids_output, compression_ratio_output]
    )
    decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)

app.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
regex
requests
pandas
tqdm
tokenizer.py ADDED
@@ -0,0 +1,308 @@
import os
import sys
import glob
import regex as re
import pandas as pd
import requests
import unicodedata
import json
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set
from tqdm import tqdm


class GujaratiBPETokenizer:
    def __init__(self, vocab_size: int = 5000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.inverse_vocab = {}
        self.compression_ratio = 0.
        self.merges = {}
        self.special_tokens = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<BOS>': 2,
            '<EOS>': 3
        }
        # Applied to the entire corpus: splits it into runs of letters/marks/digits
        # (optionally with a leading space) and runs of other symbols.
        self.global_pattern = re.compile(r""" [\p{L}\p{M}\p{N}]+|[\p{L}\p{M}\p{N}]+|[^\r\n\p{L}\p{M}\p{N}]+""")
        # Applied to each word: separates a morphological ending in "ન" or "મ"
        # followed by a combining mark.
        self.local_pattern = re.compile(r"""([\s\p{L}\p{M}]+|[\s\p{L}\p{M}\p{N}]+)([નમ](?:\p{M}))$""")
        self.eng2guj = self.get_eng_to_guj_digits_mapping()
        self.guj_unicode_df = self.get_guj_unicodes()
        # Initialize the basic Gujarati character vocabulary
        self.base_vocab = set()
        # Add basic Gujarati characters (vowels, consonants, marks)
        self._initialize_base_vocab()


    def get_guj_unicodes(self):
        res = requests.get("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
        lines = res.text.splitlines()
        lines = [",".join(line.split(";")[:2]) for line in lines if "GUJARATI" in line]
        data = {
            "code": [l.split(",")[0] for l in lines],
            "name": [l.split(",")[-1] for l in lines],
            "char": [unicodedata.lookup(l.split(",")[1]) for l in lines],
        }
        df = pd.DataFrame(data)
        return df
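        # Descriptive note: the DataFrame has one row per Gujarati code point in
        # UnicodeData.txt, e.g. code "0A85", name "GUJARATI LETTER A", char "અ".
        # Building it requires network access at construction time.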


    def _initialize_base_vocab(self):
        """Initialize vocabulary with basic Gujarati characters"""
        # All Gujarati code points pulled from UnicodeData.txt
        self.base_vocab.update(self.guj_unicode_df["char"].to_list())
        # Whitespace characters plus the full stop.
        self.base_vocab.update([' ', '\n', '\t', "."])


    def _get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Count frequency of adjacent pairs in the vocabulary"""
        pairs = defaultdict(int)
        for word in words:
            for i in range(len(word) - 1):
                pairs[tuple(word[i:i + 2])] += 1
        return pairs
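        # Illustrative example with ASCII stand-ins (real tokens are Gujarati):
        #   _get_stats([["a", "b", "c"], ["a", "b"]])
        #   -> {("a", "b"): 2, ("b", "c"): 1}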


    def _merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Merge all occurrences of the most frequent pair"""
        first, second = pair
        new_words = []

        for word in words:
            i = 0
            new_word = []
            while i < len(word):
                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_words.append(new_word)

        return new_words
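        # Illustrative example (ASCII stand-ins): merging the pair ("a", "b")
        #   _merge_vocab([["a", "b", "c"], ["b", "a"]], ("a", "b"))
        #   -> [["ab", "c"], ["b", "a"]]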

    def get_eng_to_guj_digits_mapping(self):
        e2g = dict()
        # Add digits 0 to 9
        for i in range(10):
            e2g[str(i)] = unicodedata.lookup(f"GUJARATI DIGIT {unicodedata.name(chr(48+i)).split()[-1]}")

        return e2g
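        # The resulting mapping is {"0": "૦", "1": "૧", ..., "9": "૯"}, i.e.
        # Western digits to Gujarati digits (U+0AE6 through U+0AEF).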


    def remove_eng_words(self, text):
        pat = re.compile(r"[a-zA-Z]+", re.IGNORECASE)
        text = " ".join(re.sub(pat, "", text).split())
        # text = re.sub(pat, "", text))
        return text
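        # Illustrative example: Latin-script words are dropped and whitespace is
        # re-normalised, e.g. "Gujarat ગુજરાત state રાજ્ય" -> "ગુજરાત રાજ્ય".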


    def eng_to_guj_digits(self, text, e2g):
        new_text = ""
        for ch in text:
            # Convert only Western digits 0-9; Gujarati digits (and any other
            # character) pass through unchanged.
            if ch in e2g:
                new_text += e2g[ch]
            else:
                new_text += ch

        return new_text
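        # Illustrative example:
        #   eng_to_guj_digits("વર્ષ 2024", self.eng2guj) -> "વર્ષ ૨૦૨૪"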


    def process_text_with_regex(self, text):
        """Split the corpus into word/symbol pieces and split off ન/મ endings."""
        split_text = re.findall(self.global_pattern, text)
        new_text = []
        for t in split_text:
            split_words = re.findall(self.local_pattern, t)
            # print(f"word: {t} --> word split: {split_words}")
            if split_words:
                for item in split_words:
                    if isinstance(item, tuple):
                        w = [i for i in item if i != ""]
                        # print(f"item: {item} --> {w}")
                        new_text.extend(w)
            else:
                new_text.append(t)

        return new_text

    def tokenize_text(self, texts: List[str]):
        """
        Takes a list of text lines and produces the list of processed words required for encoding.

        Args:
            texts (List[str]): text lines

        Returns:
            list: list of extracted words from the text lines
        """
        processed_text = []
        for t in tqdm(texts, desc="preprocessing", colour="green", bar_format="{l_bar}{bar:30}{r_bar}"):
            processed_text.append(self.eng_to_guj_digits(self.remove_eng_words(t), self.eng2guj))

        processed_text = " ".join(processed_text)
        words = self.process_text_with_regex(processed_text)

        return words


    def train(self, texts: List[str], min_freq: int = 2) -> None:
        """Train BPE model on texts"""

        tokens = self.tokenize_text(texts)
        words = tokens

        vocab = self.base_vocab.copy()
        num_merges = self.vocab_size - len(self.special_tokens) - len(vocab)
        # print("num_merges : ", num_merges)
        # Perform BPE merges
        train_bar = tqdm(range(num_merges),
                         desc="Merging pairs",
                         total=num_merges,
                         colour="blue",
                         file=sys.stdout,
                         bar_format="{l_bar}{bar:30}{r_bar}"
                         )
        for i in train_bar:
            pairs = self._get_stats(words)
            if not pairs:
                break

            # Find most frequent pair
            best_pair = max(pairs.items(), key=lambda x: x[1])
            if best_pair[1] < min_freq:
                break

            pair = best_pair[0]
            new_token = ''.join(pair)
            vocab.add(new_token)
            # print("merging ..", pair)
            # print(len(vocab))
            # Record the merge operation
            self.merges[pair] = new_token

            # Merge the pair in all words
            words = self._merge_vocab(words, pair)

        # Build final vocabulary
        self.vocab = {**self.special_tokens}
        idx = len(self.special_tokens)
        for token in sorted(vocab):
            self.vocab[token] = idx
            idx += 1

        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        # Compression ratio: base characters per post-merge token. (len(tokens) and
        # len(words) are both the word count, so compare character/token totals.)
        num_chars = sum(len(word) for word in tokens)
        num_tokens = sum(len(word) for word in words)
        self.compression_ratio = num_chars / num_tokens
        print("character count before merges:", num_chars)
        print("token count after merge operations:", num_tokens)
        print(f"compression ratio: {self.compression_ratio:.2f}X")


    def encode(self, text: str) -> List[int]:
        """Encode text using learned BPE merges"""

        tokenized_words = self.tokenize_text([text])
        words = [list(word) for word in tokenized_words]
        # print("Before merges: ", words)

        # Apply merges in order
        for pair, merged in self.merges.items():
            words = self._merge_vocab(words, pair)
        # print("After merges: ", words)

        # Convert to token IDs
        result = []
        for word in words:
            for token in word:
                if token in self.vocab:
                    result.append(self.vocab[token])
                else:
                    result.append(self.special_tokens['<UNK>'])

        return result


    def decode(self, ids: List[int]) -> str:
        """Decode token IDs back to text"""
        return ''.join(self.inverse_vocab.get(id, '<UNK>') for id in ids)
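        # Illustrative round trip (IDs are placeholders, not real vocabulary entries):
        #   ids = tokenizer.encode("ગુજરાત અને ભારતમાં સ્થાન")   # e.g. [2517, 2074, 340, ...]
        #   tokenizer.decode(ids)  # rebuilds the text from subwords, '<UNK>' for unknown symbols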


    def calculate_compression_ratio(self, text: str) -> float:
        """Calculate the compression ratio as characters per encoded token"""
        encoded = self.encode(text)
        return len(text) / len(encoded) if encoded else 0.0
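        # Worked example (illustrative figures): a 60-character sentence that
        # encodes to 15 token IDs gives 60 / 15 = 4.0. Note this is characters
        # per token, whereas app.py reports UTF-8 bytes per token.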


    def save(self, path: str) -> None:
        """Save tokenizer state"""
        # Convert tuple keys to strings for JSON serialization
        serializable_merges = {f"{first}|{second}": merged
                               for (first, second), merged in self.merges.items()}

        data = {
            'vocab': self.vocab,
            'merges': serializable_merges,
            'vocab_size': self.vocab_size,
            'special_tokens': self.special_tokens,
            'compression_ratio': self.compression_ratio
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
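        # Serialization note: a merge such as ("a", "b") -> "ab" is stored in the
        # JSON under the key "a|b" (ASCII stand-ins shown); load() splits on "|"
        # to rebuild the tuple keys.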


    @classmethod
    def load(cls, path: str) -> 'GujaratiBPETokenizer':
        """Load tokenizer from file"""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.vocab = data['vocab']

        # Convert string keys back to tuples
        tokenizer.merges = {tuple(k.split('|')): v
                            for k, v in data['merges'].items()}

        tokenizer.special_tokens = data['special_tokens']
        tokenizer.inverse_vocab = {v: k for k, v in tokenizer.vocab.items()}
        tokenizer.compression_ratio = data['compression_ratio']
        print("Tokenizer loaded!")
        return tokenizer
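        # Usage (as in app.py): tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")
        # Note that cls() still runs __init__, so loading fetches UnicodeData.txt once.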


if __name__ == "__main__":
    # train
    data_path = "data"
    news_articles = glob.glob(os.path.join(data_path, "news dataset", "*.txt"))
    cc100_dataset = glob.glob(os.path.join(data_path, "cc100-Gujarati", "*.txt"))
    indic_dataset = glob.glob(os.path.join(data_path, "IndicCorp", "*.txt"))
    final_dataset = news_articles + cc100_dataset + indic_dataset

    texts = []
    for article in final_dataset:
        with open(article, "r", encoding='utf-8') as f:
            # Note: only the first line of each file is used for training.
            texts.append(f.readline().strip())

    tokenizer = GujaratiBPETokenizer()
    tokenizer.train(texts)
    tokenizer.save("Gujarati_tokenizer.json")

    # # test
    # tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")
    # text1 = "ચામરાજનગર ભારત દેશના દક્ષિણ ભાગમાં આવેલા કર્ણાટક રાજ્યના ચામરાજનગર જિલ્લામાં આવેલું એક નગર છે. ચામરાજનગરમાં ચામરાજનગર જિલ્લાનું મુખ્યાલય છે."
    # enc_text1 = tokenizer.encode(text1)
    # print(enc_text1, len(enc_text1))
    # text2 = tokenizer.decode(enc_text1)
    # print(text2)

    # assert text1 == text2, "Problem with BPE!!"