Commit a911970 · First Commit
Parent(s): 3a0d9b3

Files changed:
- Gujarati_tokenizer.json (+0 -0)
- app.py (+98 -0)
- requirements.txt (+4 -0)
- tokenizer.py (+308 -0)
Gujarati_tokenizer.json
ADDED
The diff for this file is too large to render.
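Based on the save() method in tokenizer.py below, the serialized file is a single JSON object. A rough sketch of its shape, written here as a Python dict; every concrete token, ID, and value is an illustrative placeholder, not taken from the actual file:

# Sketch of the saved tokenizer state (illustrative only, not the real file contents)
tokenizer_state = {
    "vocab": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3},  # plus one entry per base character / merged token
    "merges": {"ગ|ુ": "ગુ"},          # "first|second" -> merged token (hypothetical example pair)
    "vocab_size": 5000,               # default from GujaratiBPETokenizer.__init__
    "special_tokens": {"<PAD>": 0, "<UNK>": 1, "<BOS>": 2, "<EOS>": 3},
    "compression_ratio": 0.0,         # whatever value training recorded
}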
app.py
ADDED
@@ -0,0 +1,98 @@
import gradio as gr
from tokenizer import GujaratiBPETokenizer

# Load the tokenizer (load() is a classmethod, so call it on the class)
tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")


def encode_text(text):
    """
    Encodes the given Gujarati text into token IDs.
    """
    token_ids = tokenizer.encode(text)
    return token_ids


def encode_text_with_compression(text):
    """
    Encodes the given Gujarati text into token IDs and calculates the compression ratio.
    """
    # Get token IDs
    token_ids = tokenizer.encode(text)

    # Original text size in bytes
    text_byte_length = len(text.encode('utf-8'))

    # Number of token IDs
    token_id_length = len(token_ids)

    # Compression ratio: input bytes per produced token
    if token_id_length > 0:
        compression_ratio = text_byte_length / token_id_length
    else:
        compression_ratio = 0  # Handle edge case for empty input

    return token_ids, f"{compression_ratio:.2f}"


def decode_tokens(token_ids):
    """
    Decodes the given token IDs into Gujarati text.
    """
    # Ensure token_ids is a list of integers
    try:
        token_ids = list(map(int, token_ids.strip("[]").split(",")))
    except Exception as e:
        return f"Error in processing token IDs: {e}"

    decoded_text = tokenizer.decode(token_ids)
    return decoded_text


# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Gujarati Tokenizer Encoder-Decoder")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Encode Gujarati Text to Token IDs")
            Gujarati_text_input = gr.Textbox(
                label="Enter Gujarati Text",
                placeholder="અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input"
            )
            token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
            compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
            encode_button = gr.Button("Encode")

            # Examples for encoding
            encode_example = gr.Examples(
                examples=["ગુજરાત અને ભારતમાં સ્થાન",
                          "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                          "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                          "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
                inputs=Gujarati_text_input,
                outputs=[token_ids_output, compression_ratio_output],
                fn=encode_text_with_compression
            )

        with gr.Column():
            gr.Markdown("### Decode Token IDs to Gujarati Text")
            token_ids_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or list)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input"
            )
            decoded_text_output = gr.Textbox(label="Decoded Gujarati Text", interactive=False)
            decode_button = gr.Button("Decode")

    encode_button.click(
        encode_text_with_compression,
        inputs=Gujarati_text_input,
        outputs=[token_ids_output, compression_ratio_output]
    )
    decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)

app.launch()
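For quick local testing outside the Gradio UI, the same logic can be exercised with the tokenizer class directly. A minimal sketch, assuming the Gujarati_tokenizer.json from this commit is in the working directory (loading also fetches UnicodeData.txt over the network in __init__); the sample sentence is one of the app's examples:

# Minimal sketch (not part of app.py): mirrors what the Encode/Decode buttons do.
from tokenizer import GujaratiBPETokenizer

tok = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")
text = "ગુજરાત અને ભારતમાં સ્થાન"
ids = tok.encode(text)
ratio = len(text.encode("utf-8")) / len(ids)   # same bytes-per-token ratio the app reports
print(ids, f"{ratio:.2f}")
print(tok.decode(ids))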
requirements.txt
ADDED
@@ -0,0 +1,4 @@
regex
requests
pandas
tqdm
tokenizer.py
ADDED
@@ -0,0 +1,308 @@
import os
import sys
import glob
import regex as re
import pandas as pd
import requests
import unicodedata
import json
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set
from tqdm import tqdm


class GujaratiBPETokenizer:
    def __init__(self, vocab_size: int = 5000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.inverse_vocab = {}
        self.compression_ratio = 0.
        self.merges = {}
        self.special_tokens = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<BOS>': 2,
            '<EOS>': 3
        }
        # Applied to the entire corpus
        self.global_pattern = re.compile(r""" [\p{L}\p{M}\p{N}]+|[\p{L}\p{M}\p{N}]+|[^\r\n\p{L}\p{M}\p{N}]+""")
        # Applied to each word to split off morphological endings formed with "ન" or "મ"
        self.local_pattern = re.compile(r"""([\s\p{L}\p{M}]+|[\s\p{L}\p{M}\p{N}]+)([નમ](?:\p{M}))$""")
        self.eng2guj = self.get_eng_to_guj_digits_mapping()
        self.guj_unicode_df = self.get_guj_unicodes()
        # Initialize the basic Gujarati character vocabulary
        self.base_vocab = set()
        # Add basic Gujarati characters (vowels, consonants, marks)
        self._initialize_base_vocab()


    def get_guj_unicodes(self):
        res = requests.get("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
        lines = res.text.splitlines()
        lines = [",".join(line.split(";")[:2]) for line in lines if "GUJARATI" in line]
        data = {
            "code": [l.split(",")[0] for l in lines],
            "name": [l.split(",")[-1] for l in lines],
            "char": [unicodedata.lookup(l.split(",")[1]) for l in lines],
        }
        df = pd.DataFrame(data)
        return df


    def _initialize_base_vocab(self):
        """Initialize vocabulary with basic Gujarati characters"""
        # All Gujarati characters pulled from UnicodeData.txt
        self.base_vocab.update(self.guj_unicode_df["char"].to_list())
        # Whitespace characters and the period
        self.base_vocab.update([' ', '\n', '\t', "."])


    def _get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Count the frequency of adjacent token pairs across all words"""
        pairs = defaultdict(int)
        for word in words:
            for i in range(len(word) - 1):
                pairs[tuple(word[i:i + 2])] += 1
        return pairs


    def _merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Merge all occurrences of the given pair"""
        first, second = pair
        new_words = []

        for word in words:
            i = 0
            new_word = []
            while i < len(word):
                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_words.append(new_word)

        return new_words

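    # Illustration (added for this writeup, not part of the committed file):
    # one BPE step on a toy input. With
    #     words = [["ગ", "ુ", "જ"], ["ગ", "ુ"]]
    # _get_stats(words) returns {("ગ", "ુ"): 2, ("ુ", "જ"): 1}, so ("ગ", "ુ") is
    # the most frequent pair, and _merge_vocab(words, ("ગ", "ુ")) then yields
    #     [["ગુ", "જ"], ["ગુ"]]
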
    def get_eng_to_guj_digits_mapping(self):
        e2g = dict()
        # Map the Western digits 0 to 9 to their Gujarati counterparts
        for i in range(10):
            e2g[str(i)] = unicodedata.lookup(f"GUJARATI DIGIT {unicodedata.name(chr(48 + i)).split()[-1]}")

        return e2g


    def remove_eng_words(self, text):
        pat = re.compile(r"[a-zA-Z]+", re.IGNORECASE)
        text = " ".join(re.sub(pat, "", text).split())
        # text = re.sub(pat, "", text)
        return text


    def eng_to_guj_digits(self, text, e2g):
        new_text = ""
        for ch in text:
            # Convert only Western digits; Gujarati (and other) digits pass through unchanged
            if ch.isdigit() and ch in e2g:
                new_text += e2g[ch]
            else:
                new_text += ch

        return new_text


    def process_text_with_regex(self, text):
        split_text = re.findall(self.global_pattern, text)
        new_text = []
        for t in split_text:
            split_words = re.findall(self.local_pattern, t)
            # print(f"word: {t} --> word split: {split_words}")
            if split_words:
                for item in split_words:
                    if isinstance(item, tuple):
                        w = [i for i in item if i != ""]
                        # print(f"item: {item} --> {w}")
                        new_text.extend(w)
            else:
                new_text.append(t)

        return new_text

    def tokenize_text(self, texts: List[str]):
        """
        Takes a list of text lines and returns the list of processed words required for encoding.

        Args:
            texts (List[str]): text lines

        Returns:
            list: list of extracted words from the text lines
        """
        processed_text = []
        for t in tqdm(texts, desc="preprocessing", colour="green", bar_format="{l_bar}{bar:30}{r_bar}"):
            processed_text.append(self.eng_to_guj_digits(self.remove_eng_words(t), self.eng2guj))

        processed_text = " ".join(processed_text)
        words = self.process_text_with_regex(processed_text)

        return words


    def train(self, texts: List[str], min_freq: int = 2) -> None:
        """Train the BPE model on texts"""

        tokens = self.tokenize_text(texts)
        # Each word starts out as a plain string, i.e. a sequence of characters
        words = tokens

        vocab = self.base_vocab.copy()
        num_merges = self.vocab_size - len(self.special_tokens) - len(vocab)
        # print("num_merges : ", num_merges)
        # Perform BPE merges
        train_bar = tqdm(range(num_merges),
                         desc="Merging pairs",
                         total=num_merges,
                         colour="blue",
                         file=sys.stdout,
                         bar_format="{l_bar}{bar:30}{r_bar}"
                         )
        for _ in train_bar:
            pairs = self._get_stats(words)
            if not pairs:
                break

            # Find the most frequent pair
            best_pair = max(pairs.items(), key=lambda x: x[1])
            if best_pair[1] < min_freq:
                break

            pair = best_pair[0]
            new_token = ''.join(pair)
            vocab.add(new_token)
            # print("merging ..", pair)
            # print(len(vocab))
            # Record the merge operation
            self.merges[pair] = new_token

            # Merge the pair in all words
            words = self._merge_vocab(words, pair)

        # Build final vocabulary
        self.vocab = {**self.special_tokens}
        idx = len(self.special_tokens)
        for token in sorted(vocab):
            self.vocab[token] = idx
            idx += 1

        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        # Compression ratio: base symbols before merging vs. tokens after merging
        num_base_symbols = sum(len(word) for word in tokens)
        num_merged_tokens = sum(len(word) for word in words)
        self.compression_ratio = num_base_symbols / num_merged_tokens
        print("token count before merges:", num_base_symbols)
        print("token count after merges:", num_merged_tokens)
        print(f"compression ratio: {self.compression_ratio:.2f}X")


    def encode(self, text: str) -> List[int]:
        """Encode text using learned BPE merges"""

        tokenized_words = self.tokenize_text([text])
        words = [list(word) for word in tokenized_words]
        # print("Before merges: ", words)

        # Apply merges in the order they were learned
        for pair, merged in self.merges.items():
            words = self._merge_vocab(words, pair)
        # print("After merges: ", words)

        # Convert to token IDs
        result = []
        for word in words:
            for token in word:
                if token in self.vocab:
                    result.append(self.vocab[token])
                else:
                    result.append(self.special_tokens['<UNK>'])

        return result


    def decode(self, ids: List[int]) -> str:
        """Decode token IDs back to text"""
        return ''.join(self.inverse_vocab.get(token_id, '<UNK>') for token_id in ids)


    def calculate_compression_ratio(self, text: str) -> float:
        """Calculate the compression ratio (characters per token)"""
        encoded = self.encode(text)
        return len(text) / len(encoded)


    def save(self, path: str) -> None:
        """Save tokenizer state"""
        # Convert tuple keys to strings for JSON serialization
        serializable_merges = {f"{first}|{second}": merged
                               for (first, second), merged in self.merges.items()}

        data = {
            'vocab': self.vocab,
            'merges': serializable_merges,
            'vocab_size': self.vocab_size,
            'special_tokens': self.special_tokens,
            'compression_ratio': self.compression_ratio
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)


    @classmethod
    def load(cls, path: str) -> 'GujaratiBPETokenizer':
        """Load tokenizer from file"""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.vocab = data['vocab']

        # Convert string keys back to tuples
        tokenizer.merges = {tuple(k.split('|')): v
                            for k, v in data['merges'].items()}

        tokenizer.special_tokens = data['special_tokens']
        tokenizer.inverse_vocab = {v: k for k, v in tokenizer.vocab.items()}
        tokenizer.compression_ratio = data['compression_ratio']
        print("Tokenizer loaded!")
        return tokenizer


if __name__ == "__main__":
    # train
    data_path = os.path.join("data")
    news_articles = glob.glob(os.path.join(data_path, "news dataset", "*.txt"))
    cc100_dataset = glob.glob(os.path.join(data_path, "cc100-Gujarati", "*.txt"))
    indic_dataset = glob.glob(os.path.join(data_path, "IndicCorp", "*.txt"))
    final_dataset = news_articles + cc100_dataset + indic_dataset

    texts = []
    for article in final_dataset:
        with open(article, "r", encoding='utf-8') as f:
            texts.append(f.readline().strip())

    tokenizer = GujaratiBPETokenizer()
    tokenizer.train(texts)
    tokenizer.save("Gujarati_tokenizer.json")

    # # test
    # tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")
    # text1 = "ચામરાજનગર ભારત દેશના દક્ષિણ ભાગમાં આવેલા કર્ણાટક રાજ્યના ચામરાજનગર જિલ્લામાં આવેલું એક નગર છે. ચામરાજનગરમાં ચામરાજનગર જિલ્લાનું મુખ્યાલય છે."
    # enc_text1 = tokenizer.encode(text1)
    # print(enc_text1, len(enc_text1))
    # text2 = tokenizer.decode(enc_text1)
    # print(text2)

    # assert text1 == text2, "Problem with BPE!!"