Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- app.py +112 -0
- requirements.txt +1 -0
- telugu_bpe.py +173 -0
- telugu_bpe_model.json +0 -0
app.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
from telugu_bpe import TeluguBPE
|
3 |
+
import os
|
4 |
+
|
5 |
+
# Initialize the BPE model
# vocab_size here only matters when the fallback training below runs;
# load_model() overwrites it with the saved value.
bpe = TeluguBPE(vocab_size=5000)

# Get the absolute path to the model file
# (resolved relative to this file so loading works regardless of the
# working directory the Space container starts in)
current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "telugu_bpe_model.json")

# Load the pre-trained model
try:
    bpe.load_model(model_path)
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Error: Model file not found at {model_path}")
    # Train a small model with sample text if model doesn't exist.
    # NOTE(review): this fallback trains on only three sentences, so the
    # learned vocabulary is far smaller than vocab_size=5000 — it exists
    # only to keep the demo usable when the bundled model file is missing.
    sample_text = """
నమస్కారం తెలుగు భాష చాలా అందమైన భాష
తెలుగు భారతదేశంలోని ద్రావిడ భాషల్లో ఒకటి
తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి
"""
    processed_text = bpe.preprocess_telugu_text(sample_text)
    bpe.learn_bpe(processed_text)
    # Persist the fallback model so the next start skips retraining.
    bpe.save_model(model_path)
    print("Created a new model with sample text")
|
28 |
+
|
29 |
+
def process_text(input_text: str) -> dict:
    """
    Process input Telugu text and return tokenization results
    """
    # Guard clause: reject empty or whitespace-only input up front.
    if not input_text or not input_text.strip():
        return {
            "Error": "Please enter some Telugu text"
        }

    try:
        # Normalise the raw input, then tokenize it with the BPE model.
        cleaned = bpe.preprocess_telugu_text(input_text)
        tokens = bpe.encode(cleaned)

        # Simple compression statistics for the JSON panel.
        n_chars = len(cleaned)
        n_tokens = len(tokens)
        ratio = n_chars / n_tokens if n_tokens > 0 else 0

        return {
            "Preprocessed Text": cleaned,
            "Tokens": tokens,
            "Character Count": n_chars,
            "Token Count": n_tokens,
            "Compression Ratio": f"{ratio:.2f}x",
            "Vocabulary Size": len(bpe.vocab),
        }
    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return {
            "Error": f"An error occurred: {str(e)}"
        }
|
62 |
+
|
63 |
+
# Create Gradio interface
demo = gr.Interface(
    fn=process_text,  # maps the textbox input to the JSON output below
    inputs=[
        gr.Textbox(
            lines=4,
            placeholder="Enter Telugu text here...",
            label="Input Telugu Text",
            value="నమస్కారం"  # pre-filled sample input
        )
    ],
    outputs=gr.JSON(label="Tokenization Results"),
    title="Telugu BPE Tokenizer",
    description="""
## Telugu Byte Pair Encoding (BPE) Tokenizer

This tokenizer is specifically designed for Telugu text processing with a vocabulary size of ~5000 tokens.

### Features:
- Telugu-specific preprocessing
- BPE tokenization
- Compression statistics
- Character and token counts

### How to use:
1. Enter Telugu text in the input box
2. Get tokenized output and statistics

### Example inputs provided below ⬇️
""",
    examples=[
        ["నమస్కారం"],
        ["తెలుగు భాష చాలా అందమైన భాష"],
        ["నేను తెలుగులో మాట్లాడగలను"],
        ["తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి"]
    ],
    theme=gr.themes.Soft(),
    # NOTE(review): `allow_flagging` is deprecated in Gradio 4.x and was
    # renamed `flagging_mode` in Gradio 5 — confirm the pinned gradio
    # version in requirements.txt before upgrading.
    allow_flagging="never",
    # Runs process_text on every example at startup and caches the results.
    cache_examples=True
)
|
103 |
+
|
104 |
+
# Launch with Hugging Face Space configurations
if __name__ == "__main__":
    # FIX: `enable_queue=` was removed from launch() in Gradio 4.x and
    # raises TypeError there.  Calling .queue() before launch() enables
    # request queueing on both Gradio 3.x and 4.x+.
    demo.queue()
    demo.launch(
        share=False,             # Spaces provides its own public URL
        server_name="0.0.0.0",   # bind all interfaces (required in the Space container)
        server_port=7860,        # the port Hugging Face Spaces expects
        show_error=True,         # surface tracebacks in the UI for debugging
    )
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
telugu_bpe.py
ADDED
@@ -0,0 +1,173 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import collections
|
3 |
+
from typing import Dict, List, Tuple, Set
|
4 |
+
import json
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
class TeluguBPE:
    """Byte Pair Encoding (BPE) tokenizer specialised for Telugu text.

    Learns merge rules from a whitespace-tokenized corpus starting at the
    character level, then applies them greedily (in learned order) to
    encode new text into sub-word tokens.
    """

    def __init__(self, vocab_size: int = 5000):
        # Target vocabulary size: initial characters plus learned merges.
        self.vocab_size = vocab_size
        # Learned merge rules in the order they were learned:
        # (left, right) -> merged symbol.  Order matters for encode().
        self.merges: Dict[Tuple[str, str], str] = {}
        # Every known symbol (single characters and merged tokens).
        self.vocab: Set[str] = set()

    def preprocess_telugu_text(self, text: str) -> str:
        """Normalise raw text for tokenization.

        Keeps Telugu script, digits, and the punctuation handled below;
        collapses whitespace; and detaches dependent vowel signs so they
        become separate initial symbols.
        """
        # Keep only Telugu (U+0C00-U+0C7F), digits, the punctuation marks
        # handled below, and whitespace.
        # FIX: the original pattern [^\u0C00-\u0C7F\s\n] also stripped
        # digits and the ।॥,?! marks, which made the digit/punctuation
        # spacing rules below unreachable dead code.
        text = re.sub(r'[^\u0C00-\u0C7F\d।॥,?!\s]', '', text)

        # Normalize runs of whitespace to single spaces.
        text = re.sub(r'\s+', ' ', text)

        # Surround digit runs with spaces so numbers become standalone tokens.
        text = re.sub(r'(\d+)', r' \1 ', text)

        # Surround punctuation marks with spaces.
        text = re.sub(r'([।॥,?!])', r' \1 ', text)

        # Ensure a space after purna virama / double danda (sentence ends).
        text = re.sub(r'([।॥])', r'\1 ', text)

        # Detach dependent vowel signs (matras, U+0C3E-U+0C4C) so each is
        # its own symbol for BPE.
        text = re.sub(r'([\u0C3E-\u0C4C])', r' \1', text)

        return text.strip()

    def get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Count frequencies of adjacent symbol pairs across all words."""
        pairs: Dict[Tuple[str, str], int] = collections.defaultdict(int)
        for word in words:
            for i in range(len(word) - 1):
                pairs[(word[i], word[i + 1])] += 1
        return pairs

    def merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Return a copy of *words* with every occurrence of *pair* merged."""
        first, second = pair
        merged = first + second
        new_words = []

        for word in words:
            new_word = []
            i = 0
            while i < len(word):
                # Greedy left-to-right merge of non-overlapping occurrences.
                if i < len(word) - 1 and word[i] == first and word[i + 1] == second:
                    new_word.append(merged)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_words.append(new_word)

        return new_words

    def learn_bpe(self, text: str) -> None:
        """Learn BPE merges from *text*.

        Words are split on whitespace; the initial vocabulary is the set
        of individual characters.  Merging stops when vocab_size is
        reached or no adjacent pairs remain.
        """
        words = [list(word) for word in text.split()]
        self.vocab = {char for word in words for char in word}

        num_merges = self.vocab_size - len(self.vocab)

        for _ in range(num_merges):
            pairs = self.get_stats(words)
            if not pairs:
                break  # nothing left to merge

            # Most frequent pair wins; ties break by first-seen order.
            best_pair = max(pairs.items(), key=lambda item: item[1])[0]
            self.merges[best_pair] = best_pair[0] + best_pair[1]
            self.vocab.add(self.merges[best_pair])

            words = self.merge_vocab(words, best_pair)

            if len(self.vocab) >= self.vocab_size:
                break

    def encode(self, text: str) -> List[str]:
        """Encode *text* into BPE tokens by replaying the learned merges
        in the order they were learned.  Unknown characters pass through
        as single-character tokens."""
        words = [list(word) for word in text.split()]
        for pair in self.merges:
            words = self.merge_vocab(words, pair)
        return [token for word in words for token in word]

    def save_model(self, path: str) -> None:
        """Serialise the model (vocab_size, merges, vocab) to JSON."""
        model_data = {
            'vocab_size': self.vocab_size,
            # Tuple keys are not valid JSON, so encode them as
            # space-joined strings (symbols never contain spaces).
            'merges': {f'{k[0]} {k[1]}': v for k, v in self.merges.items()},
            'vocab': list(self.vocab)
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(model_data, f, ensure_ascii=False, indent=2)

    def load_model(self, path: str) -> None:
        """Load a model previously written by save_model()."""
        with open(path, 'r', encoding='utf-8') as f:
            model_data = json.load(f)

        self.vocab_size = model_data['vocab_size']
        # Reverse the space-joined key encoding used by save_model().
        self.merges = {tuple(k.split()): v for k, v in model_data['merges'].items()}
        self.vocab = set(model_data['vocab'])
|
124 |
+
|
125 |
+
def main():
    """Train a Telugu BPE model from a local corpus file, print
    compression statistics, and save the model to JSON.

    Raises FileNotFoundError if telugu_text.txt is missing.
    """
    input_file = "telugu_text.txt"
    model_file = "telugu_bpe_model.json"

    # Read input text
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    print('Started learning BPE')
    bpe = TeluguBPE(vocab_size=5000)

    # Preprocess text
    processed_text = bpe.preprocess_telugu_text(text)

    # Original text statistics (characters and whitespace-separated words)
    original_chars = len(processed_text)
    original_tokens = len(processed_text.split())

    # Learn BPE
    bpe.learn_bpe(processed_text)

    # Encode the entire text to calculate compression
    encoded_text = bpe.encode(processed_text)
    encoded_length = len(encoded_text)

    # FIX: guard against ZeroDivisionError when the corpus encodes to
    # zero tokens (e.g. an empty or whitespace-only input file).
    compression_ratio = original_chars / encoded_length if encoded_length else 0.0

    # Save model
    bpe.save_model(model_file)

    # Print statistics
    print("\nCompression Statistics:")
    print(f"Original characters: {original_chars}")
    print(f"Original tokens (words): {original_tokens}")
    print(f"Encoded tokens: {encoded_length}")
    print(f"Compression ratio: {compression_ratio:.2f}x")
    print(f"Vocabulary size: {len(bpe.vocab)}")

    # Example encoding
    sample_text = "నమస్కారం"  # "Hello" in Telugu
    encoded = bpe.encode(bpe.preprocess_telugu_text(sample_text))
    print("\nExample encoding:")
    print(f"Sample text: {sample_text}")
    print(f"Encoded text: {encoded}")

if __name__ == "__main__":
    main()
|
telugu_bpe_model.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|