tranquilkd committed on
Commit a911970 · 1 Parent(s): 3a0d9b3

First Commit

Files changed (4)
  1. Gujarati_tokenizer.json +0 -0
  2. app.py +98 -0
  3. requirements.txt +4 -0
  4. tokenizer.py +308 -0
Gujarati_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,98 @@
import gradio as gr
from tokenizer import GujaratiBPETokenizer

# Load the tokenizer (load() is a classmethod; calling it on the class avoids
# building a throwaway instance, whose __init__ downloads UnicodeData.txt)
tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")


def encode_text(text):
    """
    Encodes the given Gujarati text into token IDs.
    """
    token_ids = tokenizer.encode(text)
    return token_ids


def encode_text_with_compression(text):
    """
    Encodes the given Gujarati text into token IDs and calculates the compression ratio.
    """
    # Get token IDs
    token_ids = tokenizer.encode(text)

    # Calculate the original text size in bytes
    text_byte_length = len(text.encode('utf-8'))

    # Calculate the number of token IDs
    token_id_length = len(token_ids)

    # Compression ratio (guard the denominator: empty input produces no tokens)
    if token_id_length > 0:
        compression_ratio = text_byte_length / token_id_length
    else:
        compression_ratio = 0  # Handle edge case for empty input

    return token_ids, f"{compression_ratio:.2f}"
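# Worked example (illustrative figures, not from a real run): a Gujarati sentence
# occupying 60 UTF-8 bytes that encodes to 15 token IDs would report a
# compression ratio of 60 / 15 = 4.00 bytes per token.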


def decode_tokens(token_ids):
    """
    Decodes the given token IDs into Gujarati text.
    """
    # Ensure token_ids is a list of integers
    try:
        token_ids = list(map(int, token_ids.strip("[]").split(",")))
    except Exception as e:
        return f"Error in processing token IDs: {e}"

    decoded_text = tokenizer.decode(token_ids)
    return decoded_text
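# The parsing above accepts either a bracketed list or bare comma-separated IDs,
# e.g. "[2517, 2074, 340]" or "2517, 2074, 340" (IDs shown are placeholders).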


# Gradio interface
with gr.Blocks() as app:
    gr.Markdown("## Gujarati Tokenizer Encoder-Decoder")

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Encode Gujarati Text to Token IDs")
            Gujarati_text_input = gr.Textbox(
                label="Enter Gujarati Text",
                placeholder="આ અહીં ગુજરાતી ટેક્સ્ટ લખો...",
                lines=4,
                key="encode_input"
            )
            token_ids_output = gr.Textbox(label="Token IDs (Encoded)", interactive=False)
            compression_ratio_output = gr.Textbox(label="Compression Ratio", interactive=False)
            encode_button = gr.Button("Encode")

            # Example for encoding
            encode_example = gr.Examples(
                examples=["ગુજરાત અને ભારતમાં સ્થાન",
                          "દેવજીની સરસવણી ગામમાં ખાસ કરીને આદિવાસી લોકો વસે છે",
                          "મકાઈ, ઘઉં, ડાંગર, મગ, અડદ, અન્ય કઠોળ તેમ જ શાકભાજી આ ગામનાં મુખ્ય ખેત-ઉત્પાદનો છે.",
                          "આ ગામમાં પ્રાથમિક શાળા, પંચાયતઘર, આંગણવાડી તેમ જ દૂધની ડેરી જેવી સવલતો પ્રાપ્ય થયેલી છે."],
                inputs=Gujarati_text_input,
                outputs=[token_ids_output, compression_ratio_output],
                fn=encode_text_with_compression
            )

        with gr.Column():
            gr.Markdown("### Decode Token IDs to Gujarati Text")
            token_ids_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or List)",
                placeholder="[2517, 2074, 340, 4, 201]",
                lines=4,
                key="decode_input"
            )
            decoded_text_output = gr.Textbox(label="Decoded Gujarati Text", interactive=False)
            decode_button = gr.Button("Decode")

    encode_button.click(
        encode_text_with_compression,
        inputs=Gujarati_text_input,
        outputs=[token_ids_output, compression_ratio_output]
    )
    decode_button.click(decode_tokens, inputs=token_ids_input, outputs=decoded_text_output)

app.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
regex
requests
pandas
tqdm
tokenizer.py ADDED
@@ -0,0 +1,308 @@
import os
import sys
import glob
import regex as re
import pandas as pd
import requests
import unicodedata
import json
from collections import defaultdict, Counter
from typing import List, Dict, Tuple, Set
from tqdm import tqdm


class GujaratiBPETokenizer:
    def __init__(self, vocab_size: int = 5000):
        self.vocab_size = vocab_size
        self.vocab = {}
        self.inverse_vocab = {}
        self.compression_ratio = 0.
        self.merges = {}
        self.special_tokens = {
            '<PAD>': 0,
            '<UNK>': 1,
            '<BOS>': 2,
            '<EOS>': 3
        }
        # Applied to the entire corpus: splits it into runs of letters/marks/digits
        # (optionally with a leading space) and runs of other symbols.
        self.global_pattern = re.compile(r""" [\p{L}\p{M}\p{N}]+|[\p{L}\p{M}\p{N}]+|[^\r\n\p{L}\p{M}\p{N}]+""")
        # Applied to each word: separates a morphological ending in "ન" or "મ"
        # followed by a combining mark.
        self.local_pattern = re.compile(r"""([\s\p{L}\p{M}]+|[\s\p{L}\p{M}\p{N}]+)([નમ](?:\p{M}))$""")
        self.eng2guj = self.get_eng_to_guj_digits_mapping()
        self.guj_unicode_df = self.get_guj_unicodes()
        # Initialize the basic Gujarati character vocabulary
        self.base_vocab = set()
        # Add basic Gujarati characters (vowels, consonants, marks)
        self._initialize_base_vocab()


    def get_guj_unicodes(self):
        res = requests.get("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt")
        lines = res.text.splitlines()
        lines = [",".join(line.split(";")[:2]) for line in lines if "GUJARATI" in line]
        data = {
            "code": [l.split(",")[0] for l in lines],
            "name": [l.split(",")[-1] for l in lines],
            "char": [unicodedata.lookup(l.split(",")[1]) for l in lines],
        }
        df = pd.DataFrame(data)
        return df
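        # Descriptive note: the DataFrame has one row per Gujarati code point in
        # UnicodeData.txt, e.g. code "0A85", name "GUJARATI LETTER A", char "અ".
        # Building it requires network access at construction time.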


    def _initialize_base_vocab(self):
        """Initialize vocabulary with basic Gujarati characters"""
        # All Gujarati code points pulled from UnicodeData.txt
        self.base_vocab.update(self.guj_unicode_df["char"].to_list())
        # Whitespace characters plus the full stop.
        self.base_vocab.update([' ', '\n', '\t', "."])


    def _get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Count frequency of adjacent pairs in the vocabulary"""
        pairs = defaultdict(int)
        for word in words:
            for i in range(len(word) - 1):
                pairs[tuple(word[i:i + 2])] += 1
        return pairs
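        # Illustrative example with ASCII stand-ins (real tokens are Gujarati):
        #   _get_stats([["a", "b", "c"], ["a", "b"]])
        #   -> {("a", "b"): 2, ("b", "c"): 1}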


    def _merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Merge all occurrences of the most frequent pair"""
        first, second = pair
        new_words = []

        for word in words:
            i = 0
            new_word = []
            while i < len(word):
                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_words.append(new_word)

        return new_words
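        # Illustrative example (ASCII stand-ins): merging the pair ("a", "b")
        #   _merge_vocab([["a", "b", "c"], ["b", "a"]], ("a", "b"))
        #   -> [["ab", "c"], ["b", "a"]]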

    def get_eng_to_guj_digits_mapping(self):
        e2g = dict()
        # Add digits 0 to 9
        for i in range(10):
            e2g[str(i)] = unicodedata.lookup(f"GUJARATI DIGIT {unicodedata.name(chr(48+i)).split()[-1]}")

        return e2g
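        # The resulting mapping is {"0": "૦", "1": "૧", ..., "9": "૯"}, i.e.
        # Western digits to Gujarati digits (U+0AE6 through U+0AEF).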


    def remove_eng_words(self, text):
        pat = re.compile(r"[a-zA-Z]+", re.IGNORECASE)
        text = " ".join(re.sub(pat, "", text).split())
        # text = re.sub(pat, "", text))
        return text
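        # Illustrative example: Latin-script words are dropped and whitespace is
        # re-normalised, e.g. "Gujarat ગુજરાત state રાજ્ય" -> "ગુજરાત રાજ્ય".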


    def eng_to_guj_digits(self, text, e2g):
        new_text = ""
        for ch in text:
            # Convert only Western digits 0-9; Gujarati digits (and any other
            # character) pass through unchanged.
            if ch in e2g:
                new_text += e2g[ch]
            else:
                new_text += ch

        return new_text
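        # Illustrative example:
        #   eng_to_guj_digits("વર્ષ 2024", self.eng2guj) -> "વર્ષ ૨૦૨૪"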


    def process_text_with_regex(self, text):
        """Split the corpus into word/symbol pieces and split off ન/મ endings."""
        split_text = re.findall(self.global_pattern, text)
        new_text = []
        for t in split_text:
            split_words = re.findall(self.local_pattern, t)
            # print(f"word: {t} --> word split: {split_words}")
            if split_words:
                for item in split_words:
                    if isinstance(item, tuple):
                        w = [i for i in item if i != ""]
                        # print(f"item: {item} --> {w}")
                        new_text.extend(w)
            else:
                new_text.append(t)

        return new_text

    def tokenize_text(self, texts: List[str]):
        """
        Takes a list of text lines and produces the list of processed words required for encoding.

        Args:
            texts (List[str]): text lines

        Returns:
            list: list of extracted words from the text lines
        """
        processed_text = []
        for t in tqdm(texts, desc="preprocessing", colour="green", bar_format="{l_bar}{bar:30}{r_bar}"):
            processed_text.append(self.eng_to_guj_digits(self.remove_eng_words(t), self.eng2guj))

        processed_text = " ".join(processed_text)
        words = self.process_text_with_regex(processed_text)

        return words


    def train(self, texts: List[str], min_freq: int = 2) -> None:
        """Train BPE model on texts"""

        tokens = self.tokenize_text(texts)
        words = tokens

        vocab = self.base_vocab.copy()
        num_merges = self.vocab_size - len(self.special_tokens) - len(vocab)
        # print("num_merges : ", num_merges)
        # Perform BPE merges
        train_bar = tqdm(range(num_merges),
                         desc="Merging pairs",
                         total=num_merges,
                         colour="blue",
                         file=sys.stdout,
                         bar_format="{l_bar}{bar:30}{r_bar}"
                         )
        for i in train_bar:
            pairs = self._get_stats(words)
            if not pairs:
                break

            # Find most frequent pair
            best_pair = max(pairs.items(), key=lambda x: x[1])
            if best_pair[1] < min_freq:
                break

            pair = best_pair[0]
            new_token = ''.join(pair)
            vocab.add(new_token)
            # print("merging ..", pair)
            # print(len(vocab))
            # Record the merge operation
            self.merges[pair] = new_token

            # Merge the pair in all words
            words = self._merge_vocab(words, pair)

        # Build final vocabulary
        self.vocab = {**self.special_tokens}
        idx = len(self.special_tokens)
        for token in sorted(vocab):
            self.vocab[token] = idx
            idx += 1

        self.inverse_vocab = {v: k for k, v in self.vocab.items()}
        # Compression ratio: base characters per post-merge token. (len(tokens) and
        # len(words) are both the word count, so compare character/token totals.)
        num_chars = sum(len(word) for word in tokens)
        num_tokens = sum(len(word) for word in words)
        self.compression_ratio = num_chars / num_tokens
        print("character count before merges:", num_chars)
        print("token count after merge operations:", num_tokens)
        print(f"compression ratio: {self.compression_ratio:.2f}X")


    def encode(self, text: str) -> List[int]:
        """Encode text using learned BPE merges"""

        tokenized_words = self.tokenize_text([text])
        words = [list(word) for word in tokenized_words]
        # print("Before merges: ", words)

        # Apply merges in order
        for pair, merged in self.merges.items():
            words = self._merge_vocab(words, pair)
        # print("After merges: ", words)

        # Convert to token IDs
        result = []
        for word in words:
            for token in word:
                if token in self.vocab:
                    result.append(self.vocab[token])
                else:
                    result.append(self.special_tokens['<UNK>'])

        return result


    def decode(self, ids: List[int]) -> str:
        """Decode token IDs back to text"""
        return ''.join(self.inverse_vocab.get(id, '<UNK>') for id in ids)
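        # Illustrative round trip (IDs are placeholders, not real vocabulary entries):
        #   ids = tokenizer.encode("ગુજરાત અને ભારતમાં સ્થાન")   # e.g. [2517, 2074, 340, ...]
        #   tokenizer.decode(ids)  # rebuilds the text from subwords, '<UNK>' for unknown symbols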


    def calculate_compression_ratio(self, text: str) -> float:
        """Calculate the compression ratio as characters per encoded token"""
        encoded = self.encode(text)
        return len(text) / len(encoded) if encoded else 0.0
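        # Worked example (illustrative figures): a 60-character sentence that
        # encodes to 15 token IDs gives 60 / 15 = 4.0. Note this is characters
        # per token, whereas app.py reports UTF-8 bytes per token.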


    def save(self, path: str) -> None:
        """Save tokenizer state"""
        # Convert tuple keys to strings for JSON serialization
        serializable_merges = {f"{first}|{second}": merged
                               for (first, second), merged in self.merges.items()}

        data = {
            'vocab': self.vocab,
            'merges': serializable_merges,
            'vocab_size': self.vocab_size,
            'special_tokens': self.special_tokens,
            'compression_ratio': self.compression_ratio
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
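        # Serialization note: a merge such as ("a", "b") -> "ab" is stored in the
        # JSON under the key "a|b" (ASCII stand-ins shown); load() splits on "|"
        # to rebuild the tuple keys.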


    @classmethod
    def load(cls, path: str) -> 'GujaratiBPETokenizer':
        """Load tokenizer from file"""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        tokenizer = cls(vocab_size=data['vocab_size'])
        tokenizer.vocab = data['vocab']

        # Convert string keys back to tuples
        tokenizer.merges = {tuple(k.split('|')): v
                            for k, v in data['merges'].items()}

        tokenizer.special_tokens = data['special_tokens']
        tokenizer.inverse_vocab = {v: k for k, v in tokenizer.vocab.items()}
        tokenizer.compression_ratio = data['compression_ratio']
        print("Tokenizer loaded!")
        return tokenizer
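        # Usage (as in app.py): tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")
        # Note that cls() still runs __init__, so loading fetches UnicodeData.txt once.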


if __name__ == "__main__":
    # train
    data_path = "data"
    news_articles = glob.glob(os.path.join(data_path, "news dataset", "*.txt"))
    cc100_dataset = glob.glob(os.path.join(data_path, "cc100-Gujarati", "*.txt"))
    indic_dataset = glob.glob(os.path.join(data_path, "IndicCorp", "*.txt"))
    final_dataset = news_articles + cc100_dataset + indic_dataset

    texts = []
    for article in final_dataset:
        with open(article, "r", encoding='utf-8') as f:
            # Note: only the first line of each file is used for training.
            texts.append(f.readline().strip())

    tokenizer = GujaratiBPETokenizer()
    tokenizer.train(texts)
    tokenizer.save("Gujarati_tokenizer.json")

    # # test
    # tokenizer = GujaratiBPETokenizer.load("Gujarati_tokenizer.json")
    # text1 = "ચામરાજનગર ભારત દેશના દક્ષિણ ભાગમાં આવેલા કર્ણાટક રાજ્યના ચામરાજનગર જિલ્લામાં આવેલું એક નગર છે. ચામરાજનગરમાં ચામરાજનગર જિલ્લાનું મુખ્યાલય છે."
    # enc_text1 = tokenizer.encode(text1)
    # print(enc_text1, len(enc_text1))
    # text2 = tokenizer.decode(enc_text1)
    # print(text2)

    # assert text1 == text2, "Problem with BPE!!"