anjikum committed on
Commit
6e778dd
·
verified ·
1 Parent(s): ea31548

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +112 -0
  2. requirements.txt +1 -0
  3. telugu_bpe.py +173 -0
  4. telugu_bpe_model.json +0 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
from telugu_bpe import TeluguBPE
import os

# Tokenizer instance shared by the whole app (used by process_text below).
bpe = TeluguBPE(vocab_size=5000)

# Resolve the model file relative to this script so the app works
# no matter what the current working directory is.
_MODEL_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)), "telugu_bpe_model.json"
)

# Prefer the pre-trained model; when the file is missing, fall back to
# training a tiny model from a few sample sentences and save it.
try:
    bpe.load_model(_MODEL_PATH)
    print("Model loaded successfully!")
except FileNotFoundError:
    print(f"Error: Model file not found at {_MODEL_PATH}")
    # Train a small model with sample text if model doesn't exist
    _SAMPLE_TEXT = """
    నమస్కారం తెలుగు భాష చాలా అందమైన భాష
    తెలుగు భారతదేశంలోని ద్రావిడ భాషల్లో ఒకటి
    తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి
    """
    bpe.learn_bpe(bpe.preprocess_telugu_text(_SAMPLE_TEXT))
    bpe.save_model(_MODEL_PATH)
    print("Created a new model with sample text")
29
def process_text(input_text: str) -> dict:
    """
    Tokenize Telugu *input_text* with the shared BPE model.

    Returns a dict with the preprocessed text, the token list, and basic
    statistics on success, or a dict containing an "Error" key otherwise.
    """
    # Guard: reject missing or whitespace-only input early.
    if not input_text or not input_text.strip():
        return {
            "Error": "Please enter some Telugu text"
        }

    try:
        cleaned = bpe.preprocess_telugu_text(input_text)
        tokens = bpe.encode(cleaned)

        n_chars = len(cleaned)
        n_tokens = len(tokens)
        # Avoid division by zero when nothing survives preprocessing.
        ratio = n_chars / n_tokens if n_tokens > 0 else 0

        return {
            "Preprocessed Text": cleaned,
            "Tokens": tokens,
            "Character Count": n_chars,
            "Token Count": n_tokens,
            "Compression Ratio": f"{ratio:.2f}x",
            "Vocabulary Size": len(bpe.vocab),
        }
    except Exception as exc:
        # Surface any tokenizer failure to the UI instead of crashing the app.
        return {
            "Error": f"An error occurred: {str(exc)}"
        }
62
+
63
# Long-form markdown description rendered above the interface.
_DESCRIPTION = """
## Telugu Byte Pair Encoding (BPE) Tokenizer

This tokenizer is specifically designed for Telugu text processing with a vocabulary size of ~5000 tokens.

### Features:
- Telugu-specific preprocessing
- BPE tokenization
- Compression statistics
- Character and token counts

### How to use:
1. Enter Telugu text in the input box
2. Get tokenized output and statistics

### Example inputs provided below ⬇️
"""

# Ready-made inputs users can click to try the tokenizer.
_EXAMPLES = [
    ["నమస్కారం"],
    ["తెలుగు భాష చాలా అందమైన భాష"],
    ["నేను తెలుగులో మాట్లాడగలను"],
    ["తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి"],
]

# Gradio UI: a single text input mapped to a JSON panel of results.
demo = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            lines=4,
            placeholder="Enter Telugu text here...",
            label="Input Telugu Text",
            value="నమస్కారం",
        )
    ],
    outputs=gr.JSON(label="Tokenization Results"),
    title="Telugu BPE Tokenizer",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    theme=gr.themes.Soft(),
    allow_flagging="never",
    cache_examples=True,
)
103
+
104
# Launch with Hugging Face Space configurations
if __name__ == "__main__":
    # BUGFIX: `enable_queue=` was deprecated in Gradio 3.x and removed in
    # Gradio 4.x, where it raises TypeError at launch. Request queueing
    # explicitly via .queue() instead, which works on both major versions.
    demo.queue()
    demo.launch(
        share=False,            # Spaces expose the public URL themselves
        server_name="0.0.0.0",  # bind all interfaces inside the container
        server_port=7860,       # port Hugging Face Spaces expects
        show_error=True,        # show tracebacks in the UI for debugging
    )
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gradio
telugu_bpe.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import collections
3
+ from typing import Dict, List, Tuple, Set
4
+ import json
5
+ from pathlib import Path
6
+
7
class TeluguBPE:
    """Byte Pair Encoding (BPE) tokenizer tailored to Telugu text."""

    def __init__(self, vocab_size: int = 5000):
        # Target number of entries in the final vocabulary.
        self.vocab_size = vocab_size
        # Learned merge rules, (left, right) -> merged token, in learned order.
        self.merges: Dict[Tuple[str, str], str] = {}
        # All known tokens: single characters plus merged units.
        self.vocab: Set[str] = set()

    def preprocess_telugu_text(self, text: str) -> str:
        """
        Normalize raw text for tokenization.

        Keeps only Telugu-block characters and whitespace, collapses
        whitespace runs, pads digit runs with spaces, and puts a space
        before each dependent vowel sign.
        """
        # Drop everything outside the Telugu Unicode block except whitespace.
        text = re.sub(r'[^\u0C00-\u0C7F\s\n]', '', text)
        # Collapse all whitespace runs into single spaces.
        text = re.sub(r'\s+', ' ', text)
        # Pad digit runs with spaces. NOTE(review): ASCII digits were already
        # removed above, so this only affects Telugu digits (\u0C66-\u0C6F).
        text = re.sub(r'(\d+)', r' \1 ', text)
        # Pad punctuation with spaces. NOTE(review): these marks lie outside
        # \u0C00-\u0C7F and are stripped by the first substitution, so this
        # is effectively a no-op — kept for behavioral parity.
        text = re.sub(r'([।॥,?!])', r' \1 ', text)
        # Space after danda/double danda (also a no-op after the first sub).
        text = re.sub(r'([।॥])', r'\1 ', text)
        # Detach dependent vowel signs with a leading space.
        text = re.sub(r'([\u0C3E-\u0C4C])', r' \1', text)
        return text.strip()

    def get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Return frequencies of adjacent token pairs across *words*."""
        counts: Dict[Tuple[str, str], int] = collections.defaultdict(int)
        for tokens in words:
            for left, right in zip(tokens, tokens[1:]):
                counts[(left, right)] += 1
        return counts

    def merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Replace every adjacent occurrence of *pair* with its merged token."""
        left, right = pair
        merged = left + right
        result: List[List[str]] = []

        for tokens in words:
            out: List[str] = []
            idx = 0
            n = len(tokens)
            while idx < n:
                # Greedy left-to-right scan; a match consumes both tokens.
                if idx + 1 < n and tokens[idx] == left and tokens[idx + 1] == right:
                    out.append(merged)
                    idx += 2
                else:
                    out.append(tokens[idx])
                    idx += 1
            result.append(out)

        return result

    def learn_bpe(self, text: str) -> None:
        """Learn merge rules from *text* until vocab_size is reached."""
        # Start from a character-level segmentation of each word.
        words = [list(word) for word in text.split()]
        self.vocab = {ch for tokens in words for ch in tokens}

        # Each iteration adds at most one merged token to the vocabulary.
        for _ in range(self.vocab_size - len(self.vocab)):
            stats = self.get_stats(words)
            if not stats:
                break

            top_pair = max(stats.items(), key=lambda item: item[1])[0]
            self.merges[top_pair] = top_pair[0] + top_pair[1]
            self.vocab.add(self.merges[top_pair])

            words = self.merge_vocab(words, top_pair)

            if len(self.vocab) >= self.vocab_size:
                break

    def encode(self, text: str) -> List[str]:
        """Tokenize *text* by replaying the learned merges in order."""
        words = [list(word) for word in text.split()]
        # Dicts preserve insertion order, so merges replay as learned.
        for pair in self.merges:
            words = self.merge_vocab(words, pair)
        return [token for tokens in words for token in tokens]

    def save_model(self, path: str) -> None:
        """Write vocab size, merges, and vocabulary to *path* as JSON."""
        payload = {
            'vocab_size': self.vocab_size,
            # JSON keys must be strings: join each pair with a single space.
            'merges': {' '.join(pair): token for pair, token in self.merges.items()},
            'vocab': list(self.vocab)
        }
        with open(path, 'w', encoding='utf-8') as fh:
            json.dump(payload, fh, ensure_ascii=False, indent=2)

    def load_model(self, path: str) -> None:
        """Restore a model previously written by save_model()."""
        with open(path, 'r', encoding='utf-8') as fh:
            payload = json.load(fh)

        self.vocab_size = payload['vocab_size']
        # Keys were stored as "left right"; split them back into pairs.
        self.merges = {tuple(key.split()): token for key, token in payload['merges'].items()}
        self.vocab = set(payload['vocab'])
124
+
125
def main():
    """
    Train a Telugu BPE model from ``telugu_text.txt``, print compression
    statistics, and save the model to ``telugu_bpe_model.json``.

    Raises FileNotFoundError when the input corpus is missing.
    """
    input_file = "telugu_text.txt"
    model_file = "telugu_bpe_model.json"

    # Read input text
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()

    print('Started learning BPE')
    bpe = TeluguBPE(vocab_size=5000)

    # Preprocess text
    processed_text = bpe.preprocess_telugu_text(text)

    # Original text statistics (characters and whitespace-separated words)
    original_chars = len(processed_text)
    original_tokens = len(processed_text.split())

    # Learn BPE merges
    bpe.learn_bpe(processed_text)

    # Encode the entire text to measure compression
    encoded_text = bpe.encode(processed_text)
    encoded_length = len(encoded_text)

    # BUGFIX: guard against an empty/whitespace-only corpus, which would
    # otherwise raise ZeroDivisionError here.
    compression_ratio = original_chars / encoded_length if encoded_length else 0.0

    # Save model
    bpe.save_model(model_file)

    # Print statistics
    print("\nCompression Statistics:")
    print(f"Original characters: {original_chars}")
    print(f"Original tokens (words): {original_tokens}")
    print(f"Encoded tokens: {encoded_length}")
    print(f"Compression ratio: {compression_ratio:.2f}x")
    print(f"Vocabulary size: {len(bpe.vocab)}")

    # Example encoding
    sample_text = "నమస్కారం"  # "Hello" in Telugu
    encoded = bpe.encode(bpe.preprocess_telugu_text(sample_text))
    print("\nExample encoding:")
    print(f"Sample text: {sample_text}")
    print(f"Encoded text: {encoded}")


if __name__ == "__main__":
    main()
telugu_bpe_model.json ADDED
The diff for this file is too large to render. See raw diff