anjikum committed on
Commit
6e778dd
·
verified ·
1 Parent(s): ea31548

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +112 -0
  2. requirements.txt +1 -0
  3. telugu_bpe.py +173 -0
  4. telugu_bpe_model.json +0 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from telugu_bpe import TeluguBPE
import os

# Tokenizer instance shared by the whole app (used by process_text below).
bpe = TeluguBPE(vocab_size=5000)

# Resolve the model file relative to this script so the app works
# no matter what the current working directory is.
_MODEL_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "telugu_bpe_model.json"
)

# Prefer the pre-trained model; when the file is missing, fall back to
# training a tiny model from a few sample sentences and save it.
try:
    bpe.load_model(_MODEL_PATH)
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Error: Model file not found at {_MODEL_PATH}")
    # Train a small model with sample text if model doesn't exist
    _SAMPLE_TEXT = """
    నమస్కారం తెలుగు భాష చాలా అందమైన భాష
    తెలుగు భారతదేశంలోని ద్రావిడ భాషల్లో ఒకటి
    తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి
    """
    bpe.learn_bpe(bpe.preprocess_telugu_text(_SAMPLE_TEXT))
    bpe.save_model(_MODEL_PATH)
    print("Created a new model with sample text")
29
def process_text(input_text: str) -> dict:
    """
    Tokenize Telugu *input_text* with the shared BPE model.

    Returns a dict with the preprocessed text, the token list, and basic
    statistics on success, or a dict containing an "Error" key otherwise.
    """
    # Guard: reject missing or whitespace-only input early.
    if not input_text or not input_text.strip():
        return {
            "Error": "Please enter some Telugu text"
        }

    try:
        cleaned = bpe.preprocess_telugu_text(input_text)
        tokens = bpe.encode(cleaned)

        n_chars = len(cleaned)
        n_tokens = len(tokens)
        # Avoid division by zero when nothing survives preprocessing.
        ratio = n_chars / n_tokens if n_tokens > 0 else 0

        return {
            "Preprocessed Text": cleaned,
            "Tokens": tokens,
            "Character Count": n_chars,
            "Token Count": n_tokens,
            "Compression Ratio": f"{ratio:.2f}x",
            "Vocabulary Size": len(bpe.vocab),
        }
    except Exception as exc:
        # Surface any tokenizer failure to the UI instead of crashing the app.
        return {
            "Error": f"An error occurred: {str(exc)}"
        }
62
+
63
# Long-form markdown description rendered above the interface.
_DESCRIPTION = """
## Telugu Byte Pair Encoding (BPE) Tokenizer

This tokenizer is specifically designed for Telugu text processing with a vocabulary size of ~5000 tokens.

### Features:
- Telugu-specific preprocessing
- BPE tokenization
- Compression statistics
- Character and token counts

### How to use:
1. Enter Telugu text in the input box
2. Get tokenized output and statistics

### Example inputs provided below ⬇️
"""

# Ready-made inputs users can click to try the tokenizer.
_EXAMPLES = [
    ["నమస్కారం"],
    ["తెలుగు భాష చాలా అందమైన భాష"],
    ["నేను తెలుగులో మాట్లాడగలను"],
    ["తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి"],
]

# Gradio UI: a single text input mapped to a JSON panel of results.
demo = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            lines=4,
            placeholder="Enter Telugu text here...",
            label="Input Telugu Text",
            value="నమస్కారం",
        )
    ],
    outputs=gr.JSON(label="Tokenization Results"),
    title="Telugu BPE Tokenizer",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    theme=gr.themes.Soft(),
    allow_flagging="never",
    cache_examples=True,
)
103
+
104
# Launch with Hugging Face Space configurations
if __name__ == "__main__":
    # BUGFIX: `enable_queue=` was deprecated in Gradio 3.x and removed in
    # Gradio 4.x, where it raises TypeError at launch. Request queueing
    # explicitly via .queue() instead, which works on both major versions.
    demo.queue()
    demo.launch(
        share=False,            # Spaces expose the public URL themselves
        server_name="0.0.0.0",  # bind all interfaces inside the container
        server_port=7860,       # port Hugging Face Spaces expects
        show_error=True,        # show tracebacks in the UI for debugging
    )
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio
telugu_bpe.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import collections
3
+ from typing import Dict, List, Tuple, Set
4
+ import json
5
+ from pathlib import Path
6
+
7
class TeluguBPE:
    """Byte Pair Encoding (BPE) tokenizer tailored to Telugu text."""

    def __init__(self, vocab_size: int = 5000):
        # Target number of entries in the final vocabulary.
        self.vocab_size = vocab_size
        # Learned merge rules, (left, right) -> merged token, in learned order.
        self.merges: Dict[Tuple[str, str], str] = {}
        # All known tokens: single characters plus merged units.
        self.vocab: Set[str] = set()

    def preprocess_telugu_text(self, text: str) -> str:
        """
        Normalize raw text for tokenization.

        Keeps only Telugu-block characters and whitespace, collapses
        whitespace runs, pads digit runs with spaces, and puts a space
        before each dependent vowel sign.
        """
        # Drop everything outside the Telugu Unicode block except whitespace.
        text = re.sub(r'[^\u0C00-\u0C7F\s\n]', '', text)
        # Collapse all whitespace runs into single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Pad digit runs with spaces. NOTE(review): ASCII digits were already
        # removed above, so this only affects Telugu digits (\u0C66-\u0C6F).
        text = re.sub(r'(\d+)', r' \1 ', text)
        # Pad punctuation with spaces. NOTE(review): these marks lie outside
        # \u0C00-\u0C7F and are stripped by the first substitution, so this
        # is effectively a no-op — kept for behavioral parity.
        text = re.sub(r'([।॥,?!])', r' \1 ', text)
        # Space after danda/double danda (also a no-op after the first sub).
        text = re.sub(r'([।॥])', r'\1 ', text)
        # Detach dependent vowel signs with a leading space.
        text = re.sub(r'([\u0C3E-\u0C4C])', r' \1', text)
        return text.strip()

    def get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Return frequencies of adjacent token pairs across *words*."""
        counts: Dict[Tuple[str, str], int] = collections.defaultdict(int)
        for tokens in words:
            for left, right in zip(tokens, tokens[1:]):
                counts[(left, right)] += 1
        return counts

    def merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Replace every adjacent occurrence of *pair* with its merged token."""
        left, right = pair
        merged = left + right
        result: List[List[str]] = []

        for tokens in words:
            out: List[str] = []
            idx = 0
            n = len(tokens)
            while idx < n:
                # Greedy left-to-right scan; a match consumes both tokens.
                if idx + 1 < n and tokens[idx] == left and tokens[idx + 1] == right:
                    out.append(merged)
                    idx += 2
                else:
                    out.append(tokens[idx])
                    idx += 1
            result.append(out)

        return result

    def learn_bpe(self, text: str) -> None:
        """Learn merge rules from *text* until vocab_size is reached."""
        # Start from a character-level segmentation of each word.
        words = [list(word) for word in text.split()]
        self.vocab = {ch for tokens in words for ch in tokens}

        # Each iteration adds at most one merged token to the vocabulary.
        for _ in range(self.vocab_size - len(self.vocab)):
            stats = self.get_stats(words)
            if not stats:
                break

            top_pair = max(stats.items(), key=lambda item: item[1])[0]
            self.merges[top_pair] = top_pair[0] + top_pair[1]
            self.vocab.add(self.merges[top_pair])

            words = self.merge_vocab(words, top_pair)

            if len(self.vocab) >= self.vocab_size:
                break

    def encode(self, text: str) -> List[str]:
        """Tokenize *text* by replaying the learned merges in order."""
        words = [list(word) for word in text.split()]
        # Dicts preserve insertion order, so merges replay as learned.
        for pair in self.merges:
            words = self.merge_vocab(words, pair)
        return [token for tokens in words for token in tokens]

    def save_model(self, path: str) -> None:
        """Write vocab size, merges, and vocabulary to *path* as JSON."""
        payload = {
            'vocab_size': self.vocab_size,
            # JSON keys must be strings: join each pair with a single space.
            'merges': {' '.join(pair): token for pair, token in self.merges.items()},
            'vocab': list(self.vocab)
        }
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    def load_model(self, path: str) -> None:
        """Restore a model previously written by save_model()."""
        with open(path, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)

        self.vocab_size = payload['vocab_size']
        # Keys were stored as "left right"; split them back into pairs.
        self.merges = {tuple(key.split()): token for key, token in payload['merges'].items()}
        self.vocab = set(payload['vocab'])
124
+
125
def main():
    """
    Train a Telugu BPE model from ``telugu_text.txt``, print compression
    statistics, and save the model to ``telugu_bpe_model.json``.

    Raises FileNotFoundError when the input corpus is missing.
    """
    input_file = "telugu_text.txt"
    model_file = "telugu_bpe_model.json"

    # Read input text
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    print('Started learning BPE')
    bpe = TeluguBPE(vocab_size=5000)

    # Preprocess text
    processed_text = bpe.preprocess_telugu_text(text)

    # Original text statistics (characters and whitespace-separated words)
    original_chars = len(processed_text)
    original_tokens = len(processed_text.split())

    # Learn BPE merges
    bpe.learn_bpe(processed_text)

    # Encode the entire text to measure compression
    encoded_text = bpe.encode(processed_text)
    encoded_length = len(encoded_text)

    # BUGFIX: guard against an empty/whitespace-only corpus, which would
    # otherwise raise ZeroDivisionError here.
    compression_ratio = original_chars / encoded_length if encoded_length else 0.0

    # Save model
    bpe.save_model(model_file)

    # Print statistics
    print("\nCompression Statistics:")
    print(f"Original characters: {original_chars}")
    print(f"Original tokens (words): {original_tokens}")
    print(f"Encoded tokens: {encoded_length}")
    print(f"Compression ratio: {compression_ratio:.2f}x")
    print(f"Vocabulary size: {len(bpe.vocab)}")

    # Example encoding
    sample_text = "నమస్కారం"  # "Hello" in Telugu
    encoded = bpe.encode(bpe.preprocess_telugu_text(sample_text))
    print("\nExample encoding:")
    print(f"Sample text: {sample_text}")
    print(f"Encoded text: {encoded}")


if __name__ == "__main__":
    main()
telugu_bpe_model.json ADDED
The diff for this file is too large to render. See raw diff