Spaces:
Sleeping
Sleeping
File size: 5,840 Bytes
6e778dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 |
import re
import collections
from typing import Dict, List, Tuple, Set
import json
from pathlib import Path
class TeluguBPE:
    """Byte-pair-encoding (BPE) tokenizer specialised for Telugu text."""

    def __init__(self, vocab_size: int = 5000):
        # Target size of the final vocabulary (base characters + merged tokens).
        self.vocab_size = vocab_size
        # Learned merge rules: (left, right) -> fused token, in learned order.
        self.merges: Dict[Tuple[str, str], str] = {}
        # Every known token: single characters plus all merge products.
        self.vocab: Set[str] = set()

    def preprocess_telugu_text(self, text: str) -> str:
        """
        Normalize raw text down to Telugu-only content.

        Keeps only characters in the Telugu Unicode block (U+0C00-U+0C7F)
        plus whitespace, collapses whitespace runs, isolates digit runs
        (Telugu digits U+0C66-U+0C6F survive the filter and the digit
        class matches them), and splits dependent vowel signs from their
        base consonant so BPE sees them as separate symbols.
        """
        # Drop everything outside the Telugu block except whitespace.
        text = re.sub(r'[^\u0C00-\u0C7F\s\n]', '', text)
        # Collapse all whitespace runs to a single space.
        text = re.sub(r'\s+', ' ', text)
        # Surround digit runs with spaces; only Telugu digits can still
        # be present at this point, and \d matches them (Unicode mode).
        text = re.sub(r'(\d+)', r' \1 ', text)
        # NOTE(review): the original also tried to space out danda marks
        # (U+0964/U+0965) and ASCII punctuation here, but the Telugu-only
        # filter above has already removed those characters, so the rules
        # could never match and have been dropped as dead code.
        # Split dependent vowel signs (U+0C3E-U+0C4C) from the preceding
        # consonant.
        text = re.sub(r'([\u0C3E-\u0C4C])', r' \1', text)
        return text.strip()

    def get_stats(self, words: List[List[str]]) -> Dict[Tuple[str, str], int]:
        """Count how often each adjacent symbol pair occurs across all words."""
        pairs: Dict[Tuple[str, str], int] = collections.defaultdict(int)
        for word in words:
            for left, right in zip(word, word[1:]):
                pairs[(left, right)] += 1
        return pairs

    def merge_vocab(self, words: List[List[str]], pair: Tuple[str, str]) -> List[List[str]]:
        """Replace every adjacent occurrence of ``pair`` with the fused token."""
        first, second = pair
        merged = first + second
        new_words: List[List[str]] = []
        for word in words:
            new_word: List[str] = []
            i = 0
            n = len(word)
            while i < n:
                if i + 1 < n and word[i] == first and word[i + 1] == second:
                    new_word.append(merged)
                    i += 2  # consume both halves of the pair
                else:
                    new_word.append(word[i])
                    i += 1
            new_words.append(new_word)
        return new_words

    def learn_bpe(self, text: str) -> None:
        """
        Learn merge rules from ``text``.

        Stops when the vocabulary reaches ``vocab_size`` or when no
        adjacent pair remains to merge.
        """
        # Initial segmentation: each whitespace-delimited word as characters.
        words = [list(word) for word in text.split()]
        self.vocab = {char for word in words for char in word}
        # Clamp at zero in case the character inventory already exceeds
        # the requested vocabulary size (range() would be empty anyway,
        # but the intent should be explicit).
        num_merges = max(0, self.vocab_size - len(self.vocab))
        for _ in range(num_merges):
            pairs = self.get_stats(words)
            if not pairs:
                break  # nothing left to merge
            best_pair = max(pairs, key=pairs.get)
            self.merges[best_pair] = best_pair[0] + best_pair[1]
            self.vocab.add(self.merges[best_pair])
            words = self.merge_vocab(words, best_pair)
            if len(self.vocab) >= self.vocab_size:
                break

    def encode(self, text: str) -> List[str]:
        """Tokenize ``text`` by replaying the learned merges."""
        words = [list(word) for word in text.split()]
        # dicts preserve insertion order, so merges replay in learned order.
        for pair in self.merges:
            words = self.merge_vocab(words, pair)
        return [token for word in words for token in word]

    def save_model(self, path: str) -> None:
        """Serialize the model to ``path`` as human-readable UTF-8 JSON."""
        model_data = {
            'vocab_size': self.vocab_size,
            # JSON keys must be strings: encode each pair as "left right".
            'merges': {f'{k[0]} {k[1]}': v for k, v in self.merges.items()},
            'vocab': list(self.vocab)
        }
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(model_data, f, ensure_ascii=False, indent=2)

    def load_model(self, path: str) -> None:
        """Restore a model previously written by :meth:`save_model`."""
        with open(path, 'r', encoding='utf-8') as f:
            model_data = json.load(f)
        self.vocab_size = model_data['vocab_size']
        # Reverse the "left right" key encoding used by save_model.
        self.merges = {tuple(k.split()): v for k, v in model_data['merges'].items()}
        self.vocab = set(model_data['vocab'])
def main(input_file: str = "telugu_text.txt",
         model_file: str = "telugu_bpe_model.json") -> None:
    """
    Train a Telugu BPE model from ``input_file``, print compression
    statistics, and save the trained model to ``model_file``.

    Paths are parameters (with the original hard-coded values as
    defaults) so the script can be reused on other corpora.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()
    print("Started learning BPE")

    bpe = TeluguBPE(vocab_size=5000)
    processed_text = bpe.preprocess_telugu_text(text)

    # Pre-training statistics for the compression report.
    original_chars = len(processed_text)
    original_tokens = len(processed_text.split())

    bpe.learn_bpe(processed_text)

    # Encode the full corpus to measure how well the merges compress it.
    encoded_text = bpe.encode(processed_text)
    encoded_length = len(encoded_text)
    # Guard the ratio against an empty corpus (no tokens -> no division).
    compression_ratio = original_chars / encoded_length if encoded_length else 0.0

    bpe.save_model(model_file)

    print("\nCompression Statistics:")
    print(f"Original characters: {original_chars}")
    print(f"Original tokens (words): {original_tokens}")
    print(f"Encoded tokens: {encoded_length}")
    print(f"Compression ratio: {compression_ratio:.2f}x")
    print(f"Vocabulary size: {len(bpe.vocab)}")

    # Demonstrate encoding on a single sample word.
    sample_text = "నమస్కారం"  # "Hello" in Telugu
    encoded = bpe.encode(bpe.preprocess_telugu_text(sample_text))
    print("\nExample encoding:")
    print(f"Sample text: {sample_text}")
    print(f"Encoded text: {encoded}")


if __name__ == "__main__":
    main()