import re
import json
import unicodedata
from collections import Counter
from pathlib import Path
from typing import Dict, List

from byte_pair_encoder import BytePairEncoder, TokenizerInternal


class HindiBPE:
    def __init__(self, vocab_size: int = 5000):
        print(f"\nInitializing HindiBPE with max vocab size: {vocab_size}")
        self.vocab_size = vocab_size
        self.encoder = None

    def train(self, text: str) -> None:
        """Train BPE on Hindi text."""
        print("\nInitializing BytePairEncoder...")
        self.encoder = BytePairEncoder(text)

        print("\nTraining BPE...")
        self.encoder.encode_to_vocab_size(
            target_vocab_size=self.vocab_size,
            plot_interval=1000,
            print_interval=100
        )

        # Plot final statistics
        self.encoder.plot_statistics()

        # Save the trained model
        self.save_tokenizer()

    def encode(self, text: str) -> List[str]:
        """Encode Hindi text using the trained tokenizer."""
        if self.encoder is None:
            raise ValueError("Tokenizer not trained yet!")

        print("\nTokenizing text...")
        tokenizer = TokenizerInternal(self.encoder)
        tokens = list(tokenizer.tokenize(text))

        compression = self.calculate_compression_ratio(text, tokens)
        print("\nEncoding completed:")
        print(f"Token count: {len(tokens)}")
        print(f"Unique tokens: {len(set(tokens))}")
        print(f"Compression ratio: {compression:.2f}")

        return tokens

    def decode(self, tokens: List[str]) -> str:
        """Decode tokens back to text by concatenating them."""
        if self.encoder is None:
            raise ValueError("Tokenizer not trained yet!")

        print("\nDecoding tokens...")
        decoded = "".join(tokens)
        print(f"Decoded length: {len(decoded)} characters")
        return decoded

    def save_tokenizer(self, path: str = "tokenizer") -> None:
        """Save the tokenizer to disk."""
        save_dir = Path(path)
        save_dir.mkdir(exist_ok=True)

        # Save the encoder
        self.encoder.save_to_file(save_dir / "encoder.json")

        # Save vocabulary stats
        stats = self.get_token_statistics()
        with open(save_dir / "vocab_stats.json", "w") as f:
            json.dump(stats, f, indent=2)

        print(f"Tokenizer saved to {save_dir}")

    @classmethod
    def load_tokenizer(cls, path: str = "tokenizer") -> "HindiBPE":
        """Load a trained tokenizer from disk."""
        load_dir = Path(path)
        if not load_dir.exists():
            raise FileNotFoundError(f"Tokenizer directory not found: {load_dir}")

        # Create an instance and attach the persisted encoder
        instance = cls()
        instance.encoder = BytePairEncoder.load_from_file(load_dir / "encoder.json")
        # Keep vocab_size in sync with the loaded vocabulary rather than the default
        instance.vocab_size = len(instance.encoder.itos)

        print(f"Loaded tokenizer from {load_dir}")
        print(f"Vocabulary size: {len(instance.encoder.itos)}")
        return instance

    def get_token_statistics(self) -> Dict:
        """Get statistics about the learned tokens."""
        if self.encoder is None:
            raise ValueError("Tokenizer not trained yet!")

        token_lengths = [len(token) for token in self.encoder.itos.values()]
        return {
            'vocab_size': len(self.encoder.itos),
            'avg_token_length': sum(token_lengths) / len(token_lengths),
            'min_token_length': min(token_lengths),
            'max_token_length': max(token_lengths),
            'length_distribution': Counter(token_lengths),
            'training_stats': self.encoder.stats
        }

    def calculate_compression_ratio(self, text: str, tokens: List[str]) -> float:
        """Calculate the compression ratio (original characters per encoded character)."""
        original_size = len(text)
        encoded_size = sum(len(token) for token in tokens)
        return original_size / encoded_size


def preprocess_hindi_text(text: str) -> str:
    """Preprocess Hindi text for better BPE training."""
    # Collapse excessive whitespace
    text = re.sub(r'\s+', ' ', text.strip())

    # Normalize Unicode characters
    text = unicodedata.normalize('NFKC', text)

    # Remove unnecessary punctuation: keep the Devanagari block, whitespace,
    # and the danda sentence marker (which also falls inside \u0900-\u097F)
    text = re.sub(r'[^\u0900-\u097F\s।]', '', text)

    return text
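

# A minimal end-to-end sketch of the intended workflow (illustrative only):
# "sample_hindi.txt" is a hypothetical UTF-8 corpus file, and the round-trip
# check assumes decode simply concatenates tokens, as implemented above.
if __name__ == "__main__":
    raw = Path("sample_hindi.txt").read_text(encoding="utf-8")
    clean = preprocess_hindi_text(raw)

    bpe = HindiBPE(vocab_size=5000)
    bpe.train(clean)  # also plots statistics and saves to ./tokenizer

    tokens = bpe.encode(clean)
    print(f"Lossless round trip: {bpe.decode(tokens) == clean}")

    # A saved tokenizer can be restored later without retraining
    reloaded = HindiBPE.load_tokenizer("tokenizer")
    print(f"Reloaded encoding matches: {reloaded.encode(clean) == tokens}")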