aayushraina committed: Upload 13 files

- .gitattributes +1 -2
- README.md +114 -9
- app.py +136 -0
- app_gradio.py +123 -0
- byte_pair_encoder.py +222 -0
- data_loader.py +114 -0
- download_dataset.py +184 -0
- hindi_bpe.py +129 -0
- requirements.txt +11 -0
- test_hindi_bpe.py +148 -0
- tokenizer/encoder.json +3 -0
- tokenizer/vocab_stats.json +3 -0
- tokenizer_config.json +3 -0
.gitattributes
CHANGED
@@ -23,9 +23,7 @@
 *.pth filter=lfs diff=lfs merge=lfs -text
 *.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
 *.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
 *.wasm filter=lfs diff=lfs merge=lfs -text
@@ -33,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
README.md
CHANGED
@@ -1,13 +1,118 @@
 ---
-
-
-
-
-
-
-
-
+language: hi
+tags:
+- hindi
+- tokenizer
+- bpe
+- subword
+- text-processing
+pipeline_tag: text2text-generation
+inference: true
 license: mit
+spaces:
+- aayushraina/bpe-hindi
 ---
 
-
+# Hindi Byte Pair Encoding (BPE) Tokenizer
+
+A specialized BPE tokenizer for Hindi text that achieves efficient compression while maintaining linguistic coherence.
+
+## Online Demo
+
+Try the tokenizer in your browser: [Hindi BPE Tokenizer Demo](https://huggingface.co/spaces/aayushraina/bpe-hindi)
+
+## Project Overview
+
+This project implements a Byte Pair Encoding (BPE) tokenizer specifically designed for Hindi text. It features:
+- Efficient trie-based tokenization
+- Visualization of training progress
+- Compression ratio optimization
+- Support for large Hindi text datasets
+- Hugging Face compatibility
+
+## Project Structure
+hindi-bpe/
+├── data/                  # Dataset directory
+│   ├── train/             # Training data
+│   └── valid/             # Validation data
+├── tokenizer/             # Saved tokenizer files
+│   ├── encoder.json       # Encoder state
+│   └── vocab_stats.json   # Vocabulary statistics
+├── output/                # Visualization outputs
+├── byte_pair_encoder.py   # Core BPE implementation
+├── hindi_bpe.py           # Hindi-specific wrapper
+├── test_hindi_bpe.py      # Test suite
+└── requirements.txt       # Dependencies
+
+## Training stats
+- Iteration 4500:
+  - Vocabulary size: 4,477
+  - Data size: 448,754
+  - Compression ratio: 3.66
+  - Max token length: 64
+
+## File Descriptions
+
+1. **byte_pair_encoder.py**
+   - Core BPE implementation
+   - Trie-based tokenization
+   - Training statistics tracking
+   - Visualization utilities
+
+2. **hindi_bpe.py**
+   - Hindi-specific tokenizer wrapper
+   - Text preprocessing
+   - Model saving/loading
+   - Compression ratio calculation
+
+3. **app.py**
+   - Interactive web interface
+   - Real-time tokenization
+   - Training visualization
+   - Model parameter tuning
+
+4. **test_hindi_bpe.py**
+   - Test suite for the tokenizer
+   - Performance benchmarks
+   - Example usage
+
+## Installation
+- Clone the repository:
+  - git clone https://github.com/yourusername/hindi-bpe.git
+  - cd hindi-bpe
+- Install dependencies:
+  - pip install -r requirements.txt
+
+## Download and prepare dataset
+- python download_dataset.py
+
+### Web Interface
+- streamlit run app.py
+
+### Tests
+- python test_hindi_bpe.py
+- The test suite includes:
+  - Training pipeline verification
+  - Compression ratio validation
+  - Token count requirements
+  - Encoding/decoding accuracy
+
+## Performance Metrics
+
+The tokenizer aims to achieve:
+- Vocabulary size < 5000 tokens
+- Compression ratio ≥ 3.2
+- Fast encoding/decoding
+- Memory-efficient operation
+
+## Contributing
+
+1. Fork the repository
+2. Create a feature branch
+3. Commit your changes
+4. Push to the branch
+5. Create a Pull Request
+
+## License
+
+This project is licensed under the MIT License - see the LICENSE file for details.
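For quick reference, a minimal usage sketch of the tokenizer API added in this commit (hindi_bpe.py and data_loader.py below). The vocabulary size and file count here are illustrative assumptions, not the settings behind the published tokenizer files:

# sketch: train, save, reload, and round-trip the Hindi BPE tokenizer
from hindi_bpe import HindiBPE, preprocess_hindi_text
from data_loader import load_hindi_dataset

text = preprocess_hindi_text(load_hindi_dataset(split="train", num_files=100))
bpe = HindiBPE(vocab_size=4500)
bpe.train(text)                        # training also saves the model to ./tokenizer/

bpe = HindiBPE.load_tokenizer("tokenizer")
tokens = bpe.encode("हिंदी भाषा बहुत सुंदर है।")
print(tokens)
print(bpe.decode(tokens))              # decoding simply concatenates the tokens

This mirrors the flow the test suite (test_hindi_bpe.py) exercises end to end.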
app.py
ADDED
@@ -0,0 +1,136 @@
+import gradio as gr
+from huggingface_hub import snapshot_download
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+import pandas as pd
+import plotly.express as px
+import os
+
+# Download tokenizer if not exists
+if not os.path.exists("tokenizer"):
+    snapshot_download(
+        repo_id="aayushraina/bpe-hindi",
+        local_dir="tokenizer",
+        allow_patterns=["*.json"]
+    )
+
+class TokenizerDemo:
+    def __init__(self):
+        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")
+
+    def tokenize_text(self, text: str) -> tuple:
+        """Tokenize text and return visualization"""
+        if not text:
+            return "", None, "Please enter some text"
+
+        # Preprocess
+        text = preprocess_hindi_text(text)
+
+        # Tokenize
+        tokens = self.tokenizer.encode(text)
+
+        # Create visualization
+        token_df = pd.DataFrame({
+            'Token': tokens,
+            'Length': [len(token) for token in tokens]
+        })
+
+        fig = px.scatter(token_df,
+                         x=range(len(tokens)),
+                         y='Length',
+                         hover_data=['Token'],
+                         title='Token Lengths in Sequence')
+
+        # Calculate statistics
+        stats = {
+            'Total Tokens': len(tokens),
+            'Unique Tokens': len(set(tokens)),
+            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
+            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
+        }
+
+        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
+                              for k, v in stats.items())
+
+        return (
+            " ".join(tokens),  # Tokenized text
+            fig,               # Visualization
+            stats_str          # Statistics
+        )
+
+    def decode_tokens(self, tokens_text: str) -> str:
+        """Decode space-separated tokens back to text"""
+        if not tokens_text:
+            return "Please tokenize some text first"
+        tokens = tokens_text.split()
+        return self.tokenizer.decode(tokens)
+
+# Create Gradio interface
+demo = TokenizerDemo()
+
+interface = gr.Blocks(title="Hindi BPE Tokenizer")
+
+with interface:
+    gr.Markdown("""
+    # Hindi BPE Tokenizer Demo
+
+    This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
+    Enter Hindi text to see how it gets tokenized and analyze the token distribution.
+
+    [View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
+    """)
+
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Input Hindi Text",
+                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
+                lines=5
+            )
+            tokenize_btn = gr.Button("Tokenize")
+
+        with gr.Column():
+            tokens_output = gr.Textbox(
+                label="Tokenized Output",
+                lines=5
+            )
+            decode_btn = gr.Button("Decode")
+
+            original_output = gr.Textbox(
+                label="Decoded Text",
+                lines=5
+            )
+
+    stats_output = gr.Textbox(
+        label="Tokenization Statistics",
+        lines=4
+    )
+
+    plot_output = gr.Plot(
+        label="Token Length Distribution"
+    )
+
+    # Set up event handlers
+    tokenize_btn.click(
+        fn=demo.tokenize_text,
+        inputs=input_text,
+        outputs=[tokens_output, plot_output, stats_output]
+    )
+
+    decode_btn.click(
+        fn=demo.decode_tokens,
+        inputs=tokens_output,
+        outputs=original_output
+    )
+
+    # Add examples
+    gr.Examples(
+        examples=[
+            ["हिंदी भाषा बहुत सुंदर है।"],
+            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
+            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
+        ],
+        inputs=input_text
+    )
+
+# Launch the interface
+interface.launch()
app_gradio.py
ADDED
@@ -0,0 +1,123 @@
+import gradio as gr
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+import pandas as pd
+import plotly.express as px
+import json
+
+class TokenizerDemo:
+    def __init__(self):
+        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")
+
+    def tokenize_text(self, text: str) -> tuple:
+        """Tokenize text and return visualization"""
+        # Preprocess
+        text = preprocess_hindi_text(text)
+
+        # Tokenize
+        tokens = self.tokenizer.encode(text)
+
+        # Create visualization
+        token_df = pd.DataFrame({
+            'Token': tokens,
+            'Length': [len(token) for token in tokens]
+        })
+
+        fig = px.scatter(token_df,
+                         x=range(len(tokens)),
+                         y='Length',
+                         hover_data=['Token'],
+                         title='Token Lengths in Sequence')
+
+        # Calculate statistics
+        stats = {
+            'Total Tokens': len(tokens),
+            'Unique Tokens': len(set(tokens)),
+            'Average Token Length': sum(len(t) for t in tokens) / len(tokens),
+            'Compression Ratio': len(text) / sum(len(t) for t in tokens)
+        }
+
+        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
+                              for k, v in stats.items())
+
+        return (
+            " ".join(tokens),  # Tokenized text
+            fig,               # Visualization
+            stats_str          # Statistics
+        )
+
+    def decode_tokens(self, tokens_text: str) -> str:
+        """Decode space-separated tokens back to text"""
+        tokens = tokens_text.split()
+        return self.tokenizer.decode(tokens)
+
+def create_demo() -> gr.Interface:
+    """Create Gradio interface"""
+    demo = TokenizerDemo()
+
+    with gr.Blocks(title="Hindi BPE Tokenizer") as interface:
+        gr.Markdown("""
+        # Hindi BPE Tokenizer Demo
+
+        This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
+        Enter Hindi text to see how it gets tokenized and analyze the token distribution.
+        """)
+
+        with gr.Row():
+            with gr.Column():
+                input_text = gr.Textbox(
+                    label="Input Hindi Text",
+                    placeholder="हिंदी में टेक्स्ट दर्ज करें...",
+                    lines=5
+                )
+                tokenize_btn = gr.Button("Tokenize")
+
+            with gr.Column():
+                tokens_output = gr.Textbox(
+                    label="Tokenized Output",
+                    lines=5
+                )
+                decode_btn = gr.Button("Decode")
+
+                original_output = gr.Textbox(
+                    label="Decoded Text",
+                    lines=5
+                )
+
+        stats_output = gr.Textbox(
+            label="Tokenization Statistics",
+            lines=4
+        )
+
+        plot_output = gr.Plot(
+            label="Token Length Distribution"
+        )
+
+        # Set up event handlers
+        tokenize_btn.click(
+            fn=demo.tokenize_text,
+            inputs=input_text,
+            outputs=[tokens_output, plot_output, stats_output]
+        )
+
+        decode_btn.click(
+            fn=demo.decode_tokens,
+            inputs=tokens_output,
+            outputs=original_output
+        )
+
+        # Add examples
+        gr.Examples(
+            examples=[
+                ["हिंदी भाषा बहुत सुंदर है।"],
+                ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
+                ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
+            ],
+            inputs=input_text
+        )
+
+    return interface
+
+# Create and launch the demo
+if __name__ == "__main__":
+    demo = create_demo()
+    demo.launch()
byte_pair_encoder.py
ADDED
@@ -0,0 +1,222 @@
+from typing import List, Dict, Optional
+from tqdm import tqdm
+from collections import Counter
+from matplotlib import pyplot as plt
+import json
+from pathlib import Path
+
+class TrieNode:
+    """Node in the prefix tree (trie) for fast token matching"""
+    def __init__(self):
+        self.children = {}
+        self.is_token = False
+        self.token = None
+
+class BytePairEncoder:
+    def __init__(self, text: str):
+        # Initialize vocabulary from characters
+        self.chars = sorted(list(set(text)))
+        self.stoi = {ch: i for i, ch in enumerate(self.chars)}
+        self.itos = {i: ch for i, ch in enumerate(self.chars)}
+
+        # Initial encoding of text
+        self.data = [self.stoi[c] for c in text]
+
+        # Statistics tracking
+        self.stats = {
+            "vocab_sizes": [len(self.chars)],
+            "data_sizes": [len(self.data)],
+            "compression_ratios": [1.0],
+            "merge_counts": [],
+            "tokens_created": [],
+            "max_token_lengths": [1],
+        }
+
+        # Store original length for compression ratio
+        self.original_length = len(self.data)
+        self.max_token_length = 1
+
+    def get_digram_stats(self) -> Counter:
+        """Get digram counts"""
+        counts = Counter()
+        for pair in zip(self.data, self.data[1:]):
+            pair = (int(pair[0]), int(pair[1]))
+            counts[pair] += 1
+        return counts
+
+    def encode_to_vocab_size(self, target_vocab_size: int, plot_interval: Optional[int] = None,
+                             print_interval: int = 100) -> None:
+        """Train until reaching target vocabulary size"""
+        pbar = tqdm(total=target_vocab_size, desc="Training BPE", initial=len(self.chars))
+
+        iteration = 0
+        while len(self.itos) < target_vocab_size:
+            result = self._merge_step()
+            if result is None:
+                break
+
+            iteration += 1
+            pbar.update(1)
+
+            if print_interval and iteration % print_interval == 0:
+                self._print_progress(iteration)
+
+            if plot_interval and iteration % plot_interval == 0:
+                self.plot_statistics(iteration=iteration)
+
+        pbar.close()
+
+    def _merge_step(self):
+        """Perform one merge operation"""
+        stats = self.get_digram_stats()
+        if not stats:
+            return None
+
+        top_pair, count = max(stats.items(), key=lambda x: x[1])
+        new_token = self._add_token(top_pair)
+        self.data = self._replace_pairs(top_pair, new_token)
+        self._update_stats(count)
+
+        return new_token, count
+
+    def _add_token(self, pair: tuple) -> int:
+        """Add new token to vocabulary"""
+        token_str = self.itos[pair[0]] + self.itos[pair[1]]
+        token_id = len(self.itos)
+        self.stoi[token_str] = token_id
+        self.itos[token_id] = token_str
+        self.max_token_length = max(self.max_token_length, len(token_str))
+        return token_id
+
+    def _replace_pairs(self, pair: tuple, new_token: int) -> List[int]:
+        """Replace all occurrences of pair with new token"""
+        result = []
+        i = 0
+        while i < len(self.data):
+            if i < len(self.data) - 1 and self.data[i] == pair[0] and self.data[i + 1] == pair[1]:
+                result.append(new_token)
+                i += 2
+            else:
+                result.append(self.data[i])
+                i += 1
+        return result
+
+    def _update_stats(self, merge_count: int):
+        """Update training statistics"""
+        self.stats["vocab_sizes"].append(len(self.itos))
+        self.stats["data_sizes"].append(len(self.data))
+        compression = self.original_length / len(self.data)
+        self.stats["compression_ratios"].append(compression)
+        self.stats["merge_counts"].append(merge_count)
+        self.stats["tokens_created"].append(self.itos[len(self.itos)-1])
+        self.stats["max_token_lengths"].append(self.max_token_length)
+
+    def plot_statistics(self, iteration: Optional[int] = None):
+        """Plot training statistics"""
+        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))
+
+        # Plot training metrics
+        ax1.plot(self.stats["vocab_sizes"], self.stats["data_sizes"])
+        ax1.set_title("Vocabulary vs Dataset Size")
+
+        ax2.plot(self.stats["vocab_sizes"], self.stats["compression_ratios"])
+        ax2.set_title("Compression Ratio Progress")
+
+        if self.stats["merge_counts"]:
+            ax3.hist(self.stats["merge_counts"], bins=30)
+            ax3.set_title("Merge Counts Distribution")
+
+        if self.stats["tokens_created"]:
+            lengths = [len(t) for t in self.stats["tokens_created"]]
+            ax4.plot(range(len(lengths)), lengths)
+            ax4.set_title("Token Length Evolution")
+
+        plt.tight_layout()
+        plt.show()
+
+    def save_to_file(self, filepath: Path):
+        """Save encoder state"""
+        state = {
+            "chars": self.chars,
+            "stoi": self.stoi,
+            "max_token_length": self.max_token_length,
+            "stats": self.stats
+        }
+        with open(filepath, 'w', encoding='utf-8') as f:
+            json.dump(state, f, ensure_ascii=False, indent=2)
+
+    @classmethod
+    def load_from_file(cls, filepath: Path):
+        """Load encoder state"""
+        with open(filepath, 'r', encoding='utf-8') as f:
+            state = json.load(f)
+
+        instance = cls("")  # Create empty instance
+        instance.chars = state["chars"]
+        instance.stoi = state["stoi"]
+        instance.itos = {int(i): s for s, i in state["stoi"].items()}
+        instance.max_token_length = state["max_token_length"]
+        instance.stats = state["stats"]
+
+        return instance
+
+    def _print_progress(self, iteration: int):
+        """Print training progress"""
+        print(f"\nIteration {iteration}:")
+        print(f"Vocabulary size: {len(self.itos):,}")
+        print(f"Data size: {len(self.data):,}")
+        print(f"Compression ratio: {self.stats['compression_ratios'][-1]:.2f}")
+
+        if self.stats["merge_counts"]:
+            last_merge = self.stats["merge_counts"][-1]
+            last_token = self.stats["tokens_created"][-1]
+            print(f"Last merge count: {last_merge:,}")
+            print(f"Last token created: '{last_token}'")
+
+        print(f"Max token length: {self.max_token_length}")
+
+class TokenizerInternal:
+    """Tokenizer using trained BPE model"""
+    def __init__(self, encoder: BytePairEncoder):
+        self.stoi = encoder.stoi
+        self.max_token_length = encoder.max_token_length
+        self._trie = self._build_trie()
+
+    def _build_trie(self) -> TrieNode:
+        """Build trie for efficient tokenization"""
+        root = TrieNode()
+        for token in self.stoi:
+            node = root
+            for char in token:
+                if char not in node.children:
+                    node.children[char] = TrieNode()
+                node = node.children[char]
+            node.is_token = True
+            node.token = token
+        return root
+
+    def tokenize(self, text: str) -> List[str]:
+        """Tokenize text using trie-based matching"""
+        tokens = []
+        pos = 0
+        while pos < len(text):
+            token = self._find_longest_token(text[pos:])
+            tokens.append(token)
+            pos += len(token)
+        return tokens
+
+    def _find_longest_token(self, text: str) -> str:
+        """Find longest matching token starting at current position"""
+        node = self._trie
+        longest = text[0]
+        current = ""
+
+        for char in text[:self.max_token_length]:
+            if char not in node.children:
+                break
+            current += char
+            node = node.children[char]
+            if node.is_token:
+                longest = node.token
+
+        return longest
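A toy walk-through of the encoder above, assuming the classes exactly as defined in this file; it drives one merge directly through the private _merge_step helper, so treat it as an illustration of the mechanics rather than the intended public entry point (which is encode_to_vocab_size):

# sketch: one BPE merge on a toy string, then trie-based tokenization
from byte_pair_encoder import BytePairEncoder, TokenizerInternal

enc = BytePairEncoder("अनार अनार")
print(len(enc.itos))                 # initial vocab = number of distinct characters
enc._merge_step()                    # merges the most frequent digram into a new token
print(enc.itos[len(enc.itos) - 1])   # the newly created (two-character) token string

tok = TokenizerInternal(enc)
print(tok.tokenize("अनार"))          # greedy longest-match segmentation using the trie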
data_loader.py
ADDED
@@ -0,0 +1,114 @@
+import pandas as pd
+from pathlib import Path
+from typing import List, Dict, Tuple
+import random
+
+def load_hindi_dataset(base_path: str = "data", split: str = "train", num_files: int = None) -> str:
+    """
+    Load Hindi text from dataset with train/validation split structure.
+
+    Args:
+        base_path: Base directory containing train and validation folders
+        split: Either 'train' or 'valid'
+        num_files: Number of files to load (None for all files)
+    """
+    base_dir = Path(base_path)
+    split_dir = base_dir / split / split
+
+    if not split_dir.exists():
+        raise FileNotFoundError(f"Directory not found: {split_dir}")
+
+    print(f"\nLoading Hindi dataset from {split_dir}")
+
+    # Get all txt files in the directory
+    txt_files = list(split_dir.glob("*.txt"))
+
+    if not txt_files:
+        raise FileNotFoundError(f"No txt files found in {split_dir}")
+
+    # Sort files by word count (filenames start with the word count, e.g. "1500.txt" or "1500_1.txt")
+    txt_files.sort(key=lambda x: int(x.stem.split('_')[0]))
+
+    # Sample files if num_files is specified
+    if num_files is not None:
+        if num_files < len(txt_files):
+            txt_files = random.sample(txt_files, num_files)
+
+    print(f"Found {len(txt_files)} files")
+
+    # Load and combine text from files
+    texts = []
+    total_chars = 0
+    total_words = 0
+
+    for idx, file_path in enumerate(txt_files, 1):
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                text = f.read().strip()
+            word_count = int(file_path.stem.split('_')[0])  # Filename encodes the word count
+            texts.append(text)
+            total_chars += len(text)
+            total_words += word_count
+
+            if idx % 10 == 0:
+                print(f"Processed {idx}/{len(txt_files)} files. "
+                      f"Total characters: {total_chars:,}, "
+                      f"Total words: {total_words:,}")
+
+        except Exception as e:
+            print(f"Error reading file {file_path}: {e}")
+            continue
+
+    combined_text = "\n\n".join(texts)
+
+    print(f"\nDataset loading completed:")
+    print(f"Total files: {len(texts)}")
+    print(f"Total characters: {len(combined_text):,}")
+    print(f"Total words: {total_words:,}")
+    print(f"Average words per file: {total_words/len(texts):,.1f}")
+
+    return combined_text
+
+def get_dataset_stats(base_path: str = "data") -> Dict:
+    """Get statistics about the dataset."""
+    stats = {}
+    for split in ['train', 'valid']:
+        split_dir = Path(base_path) / split
+        if split_dir.exists():
+            txt_files = list(split_dir.glob("*.txt"))
+            word_counts = [int(f.stem.split('_')[0]) for f in txt_files]
+            stats[split] = {
+                'num_files': len(txt_files),
+                'total_words': sum(word_counts),
+                'min_words': min(word_counts) if word_counts else 0,
+                'max_words': max(word_counts) if word_counts else 0,
+                'avg_words': sum(word_counts)/len(word_counts) if word_counts else 0
+            }
+    return stats
+
+def load_train_valid_split(base_path: str = "data",
+                           train_files: int = None,
+                           valid_files: int = None) -> Tuple[str, str]:
+    """Load both train and validation splits."""
+    train_text = load_hindi_dataset(base_path, "train", train_files)
+    valid_text = load_hindi_dataset(base_path, "valid", valid_files)
+    return train_text, valid_text
+
+if __name__ == "__main__":
+    # Print dataset statistics
+    stats = get_dataset_stats()
+    print("\nDataset Statistics:")
+    print("-" * 50)
+    for split, split_stats in stats.items():
+        print(f"\n{split.upper()} Split:")
+        for key, value in split_stats.items():
+            if isinstance(value, (int, float)):
+                print(f"{key}: {value:,}")
+            else:
+                print(f"{key}: {value}")
+
+    # Load sample data
+    print("\nLoading sample data...")
+    train_text, valid_text = load_train_valid_split(train_files=5, valid_files=2)
+    print(f"\nSample train text (first 200 chars):\n{train_text[:200]}")
+    print(f"\nSample valid text (first 200 chars):\n{valid_text[:200]}")
download_dataset.py
ADDED
@@ -0,0 +1,184 @@
+import kagglehub
+from pathlib import Path
+import shutil
+import pandas as pd
+import numpy as np  # pd.np is deprecated/removed in recent pandas; use numpy directly
+import re
+import nltk
+from typing import List, Dict
+from tqdm import tqdm
+
+def count_hindi_words(text: str) -> int:
+    """Count words in Hindi text."""
+    words = text.strip().split()
+    hindi_words = [w for w in words if re.search(r'[\u0900-\u097F]', w)]
+    return len(hindi_words)
+
+def create_dataframe_from_files(downloaded_paths: List[str]) -> pd.DataFrame:
+    """Create a DataFrame from downloaded text files."""
+    print("\nCreating DataFrame from text files...")
+
+    data = []
+    for file_path in tqdm(downloaded_paths):
+        if file_path.endswith('.txt'):
+            try:
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    content = f.read().strip()
+
+                # Split into title and text (assuming first line is title)
+                lines = content.split('\n', 1)
+                title = lines[0].strip()
+                text = lines[1].strip() if len(lines) > 1 else ""
+
+                data.append({
+                    'title': title,
+                    'text': text,
+                    'word_count': count_hindi_words(content)
+                })
+            except Exception as e:
+                print(f"Error reading file {file_path}: {e}")
+                continue
+
+    df = pd.DataFrame(data)
+    print(f"Created DataFrame with {len(df)} articles")
+    return df
+
+def process_and_split_articles(df: pd.DataFrame,
+                               output_dir: Path,
+                               train_ratio: float = 0.8,
+                               min_words: int = 100,
+                               max_words: int = 5000) -> Dict[str, int]:
+    """Process articles and split them into files based on word count."""
+
+    # Create output directories
+    train_dir = output_dir / "train"
+    valid_dir = output_dir / "valid"
+    train_dir.mkdir(exist_ok=True)
+    valid_dir.mkdir(exist_ok=True)
+
+    stats = {'train': 0, 'valid': 0, 'skipped': 0}
+
+    print("\nProcessing articles...")
+    for _, row in tqdm(df.iterrows(), total=len(df)):
+        try:
+            # Skip if too short or too long
+            if row['word_count'] < min_words or row['word_count'] > max_words:
+                stats['skipped'] += 1
+                continue
+
+            # Combine title and text
+            full_text = f"{row['title']}\n\n{row['text']}"
+
+            # Decide split (train or valid)
+            is_train = np.random.random() < train_ratio
+            split_dir = train_dir if is_train else valid_dir
+
+            # Save to file named by word count
+            file_path = split_dir / f"{row['word_count']}.txt"
+            suffix = 1
+            while file_path.exists():
+                file_path = split_dir / f"{row['word_count']}_{suffix}.txt"
+                suffix += 1
+
+            with open(file_path, 'w', encoding='utf-8') as f:
+                f.write(full_text)
+
+            if is_train:
+                stats['train'] += 1
+            else:
+                stats['valid'] += 1
+
+        except Exception as e:
+            print(f"Error processing article: {e}")
+            stats['skipped'] += 1
+            continue
+
+    return stats
+
+def download_hindi_wikipedia_dataset():
+    """Download and process Hindi Wikipedia dataset."""
+    print("Starting dataset download...")
+
+    try:
+        # Download the dataset using kagglehub
+        downloaded_paths = kagglehub.dataset_download(
+            "disisbig/hindi-wikipedia-articles-172k"
+        )
+
+        print("Dataset downloaded successfully!")
+        print("Downloaded files:", downloaded_paths)
+
+        # Create data directory
+        data_dir = Path("data")
+        data_dir.mkdir(exist_ok=True)
+
+        # Create DataFrame from downloaded files
+        df = create_dataframe_from_files(downloaded_paths)
+
+        # Save DataFrame for future use
+        df.to_parquet(data_dir / "articles.parquet")
+        print(f"Saved DataFrame to {data_dir / 'articles.parquet'}")
+
+        # Process and split the articles
+        stats = process_and_split_articles(df, data_dir)
+
+        # Print statistics
+        print("\nProcessing completed:")
+        print(f"Train files: {stats['train']}")
+        print(f"Validation files: {stats['valid']}")
+        print(f"Skipped articles: {stats['skipped']}")
+
+        # Get file sizes
+        train_size = sum(f.stat().st_size for f in (data_dir / "train").glob("*.txt"))
+        valid_size = sum(f.stat().st_size for f in (data_dir / "valid").glob("*.txt"))
+
+        print(f"\nTotal size:")
+        print(f"Train: {train_size / (1024*1024):.2f} MB")
+        print(f"Validation: {valid_size / (1024*1024):.2f} MB")
+
+        return True
+
+    except Exception as e:
+        print(f"Error downloading/processing dataset: {e}")
+        return False
+
+def verify_dataset_structure():
+    """Verify the dataset directory structure and files."""
+    data_dir = Path("data")
+
+    if not data_dir.exists():
+        print("Error: Data directory not found!")
+        return False
+
+    # Check if we have the processed DataFrame
+    parquet_file = data_dir / "articles.parquet"
+    if parquet_file.exists():
+        df = pd.read_parquet(parquet_file)
+        print(f"\nArticles DataFrame:")
+        print(f"Total articles: {len(df)}")
+        # print(f"Word count range: {df['word_count'].min()} - {df['word_count'].max()}")
+
+    for split in ['train', 'valid']:
+        split_dir = data_dir / split
+        if not split_dir.exists():
+            print(f"Error: {split} directory not found!")
+            return False
+
+        txt_files = list(split_dir.glob("*.txt"))
+        if not txt_files:
+            print(f"Error: No text files found in {split} directory!")
+            return False
+
+        print(f"\n{split.upper()} split:")
+        print(f"Number of files: {len(txt_files)}")
+        word_counts = [int(f.stem.split('_')[0]) for f in txt_files]
+        print(f"Word count range: {min(word_counts)} - {max(word_counts)}")
+
+    return True
+
+if __name__ == "__main__":
+    # Download and process the dataset
+    success = download_hindi_wikipedia_dataset()
+
+    if success:
+        print("\nVerifying dataset structure...")
+        verify_dataset_structure()
hindi_bpe.py
ADDED
@@ -0,0 +1,129 @@
+import re
+from collections import Counter
+from typing import Dict, List, Tuple, Set
+import unicodedata
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+from tqdm import tqdm
+import json
+from matplotlib import pyplot as plt
+from pathlib import Path
+from byte_pair_encoder import BytePairEncoder, TokenizerInternal
+
+class HindiBPE:
+    def __init__(self, vocab_size: int = 5000):
+        print(f"\nInitializing HindiBPE with max vocab size: {vocab_size}")
+        self.vocab_size = vocab_size
+        self.encoder = None
+
+    def train(self, text: str) -> None:
+        """Train BPE on Hindi text."""
+        print("\nInitializing BytePairEncoder...")
+        self.encoder = BytePairEncoder(text)
+
+        print("\nTraining BPE...")
+        self.encoder.encode_to_vocab_size(
+            target_vocab_size=self.vocab_size,
+            plot_interval=1000,
+            print_interval=100
+        )
+
+        # Plot final statistics
+        self.encoder.plot_statistics()
+
+        # Save the trained model
+        self.save_tokenizer()
+
+    def encode(self, text: str) -> List[str]:
+        """Encode Hindi text using trained tokenizer."""
+        if self.encoder is None:
+            raise ValueError("Tokenizer not trained yet!")
+
+        print("\nTokenizing text...")
+        tokenizer = TokenizerInternal(self.encoder)
+        tokens = list(tokenizer.tokenize(text))
+
+        compression = self.calculate_compression_ratio(text, tokens)
+        print(f"\nEncoding completed:")
+        print(f"Token count: {len(tokens)}")
+        print(f"Unique tokens: {len(set(tokens))}")
+        print(f"Compression ratio: {compression:.2f}")
+
+        return tokens
+
+    def decode(self, tokens: List[str]) -> str:
+        """Decode tokens back to text."""
+        if self.encoder is None:
+            raise ValueError("Tokenizer not trained yet!")
+
+        print("\nDecoding tokens...")
+        decoded = "".join(tokens)
+        print(f"Decoded length: {len(decoded)} characters")
+        return decoded
+
+    def save_tokenizer(self, path: str = "tokenizer") -> None:
+        """Save the tokenizer to disk."""
+        save_dir = Path(path)
+        save_dir.mkdir(exist_ok=True)
+
+        # Save the encoder
+        self.encoder.save_to_file(save_dir / "encoder.json")
+
+        # Save vocabulary stats
+        stats = self.get_token_statistics()
+        with open(save_dir / "vocab_stats.json", "w") as f:
+            json.dump(stats, f, indent=2)
+
+        print(f"Tokenizer saved to {save_dir}")
+
+    @classmethod
+    def load_tokenizer(cls, path: str = "tokenizer") -> "HindiBPE":
+        """Load a trained tokenizer from disk."""
+        load_dir = Path(path)
+        if not load_dir.exists():
+            raise FileNotFoundError(f"Tokenizer directory not found: {load_dir}")
+
+        # Create instance
+        instance = cls()
+
+        # Load encoder
+        instance.encoder = BytePairEncoder.load_from_file(load_dir / "encoder.json")
+
+        print(f"Loaded tokenizer from {load_dir}")
+        print(f"Vocabulary size: {len(instance.encoder.itos)}")
+        return instance
+
+    def get_token_statistics(self) -> Dict:
+        """Get statistics about the learned tokens."""
+        if self.encoder is None:
+            raise ValueError("Tokenizer not trained yet!")
+
+        token_lengths = [len(token) for token in self.encoder.itos.values()]
+        return {
+            'vocab_size': len(self.encoder.itos),
+            'avg_token_length': sum(token_lengths) / len(token_lengths),
+            'min_token_length': min(token_lengths),
+            'max_token_length': max(token_lengths),
+            'length_distribution': Counter(token_lengths),
+            'training_stats': self.encoder.stats
+        }
+
+    def calculate_compression_ratio(self, text: str, tokens: List[str]) -> float:
+        """Calculate compression ratio."""
+        original_size = len(text)
+        encoded_size = sum(len(token) for token in tokens)
+        return original_size / encoded_size
+
+def preprocess_hindi_text(text: str) -> str:
+    """Preprocess Hindi text for better BPE training."""
+    # Remove excessive whitespace
+    text = re.sub(r'\s+', ' ', text.strip())
+
+    # Normalize Unicode characters
+    text = unicodedata.normalize('NFKC', text)
+
+    # Remove unnecessary punctuation (keep essential ones)
+    text = re.sub(r'[^\u0900-\u097F\s।]', '', text)
+
+    return text
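A small illustration of preprocess_hindi_text above; the sample string is hypothetical, and the comments only restate what the three steps (whitespace collapsing, NFKC normalization, character filtering) imply:

# sketch: what the preprocessing step keeps and drops
from hindi_bpe import preprocess_hindi_text

raw = "हिंदी   BPE tokenizer!  यह बहुत उपयोगी है।"
print(preprocess_hindi_text(raw))
# Latin letters, digits, and '!' are stripped (their surrounding spaces remain);
# Devanagari text, whitespace, and the danda '।' survive.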
requirements.txt
ADDED
@@ -0,0 +1,11 @@
+numpy==1.23.5
+pandas==1.5.3
+plotly==5.13.0
+kagglehub
+streamlit
+beautifulsoup4
+huggingface-hub>=0.19.0
+tqdm
+matplotlib
+gitpython>=3.1.0
+gradio>=4.0.0
test_hindi_bpe.py
ADDED
@@ -0,0 +1,148 @@
+import unittest
+from pathlib import Path
+from hindi_bpe import HindiBPE, preprocess_hindi_text
+from data_loader import load_hindi_dataset
+import pandas as pd
+import plotly.express as px
+from typing import List, Dict
+import time
+
+class HindiBPETest:
+    def __init__(self, vocab_size: int = 4500, num_articles: int = 1000):
+        self.vocab_size = vocab_size
+        self.num_articles = num_articles
+        self.bpe = None
+        self.text = None
+        self.encoded = None
+        self.stats = {}
+
+    def load_data(self) -> str:
+        """Load and preprocess the dataset."""
+        print("\nStep 1: Loading dataset...")
+        start_time = time.time()
+
+        # Load train split
+        self.text = load_hindi_dataset(
+            split="train",
+            num_files=self.num_articles
+        )
+        self.text = preprocess_hindi_text(self.text)
+
+        # Get validation text for testing
+        self.valid_text = load_hindi_dataset(
+            split="valid",
+            num_files=min(self.num_articles // 5, 100)  # 20% of train size or max 100
+        )
+
+        self.stats['load_time'] = time.time() - start_time
+        self.stats['original_length'] = len(self.text)
+        self.stats['valid_length'] = len(self.valid_text)
+        print(f"Loading completed in {self.stats['load_time']:.2f} seconds")
+        return self.text
+
+    def train_tokenizer(self) -> HindiBPE:
+        """Train the BPE tokenizer."""
+        print("\nStep 2: Training BPE tokenizer...")
+        start_time = time.time()
+
+        self.bpe = HindiBPE(vocab_size=self.vocab_size)
+        self.bpe.train(self.text)
+
+        self.stats['train_time'] = time.time() - start_time
+        self.stats['vocab_size'] = len(self.bpe.encoder.itos)  # learned vocabulary lives on the encoder
+        print(f"Training completed in {self.stats['train_time']:.2f} seconds")
+        return self.bpe
+
+    def encode_text(self) -> List[str]:
+        """Encode the text using trained tokenizer."""
+        print("\nStep 3: Encoding text...")
+        start_time = time.time()
+
+        self.encoded = self.bpe.encode(self.text)
+
+        self.stats['encode_time'] = time.time() - start_time
+        self.stats['encoded_length'] = sum(len(token) for token in self.encoded)
+        self.stats['compression_ratio'] = self.stats['original_length'] / self.stats['encoded_length']
+        print(f"Encoding completed in {self.stats['encode_time']:.2f} seconds")
+        return self.encoded
+
+    def save_visualizations(self, output_dir: str = "output"):
+        """Generate and save visualizations."""
+        print("\nStep 4: Generating visualizations...")
+        output_dir = Path(output_dir)
+        output_dir.mkdir(exist_ok=True)
+
+        # Token length distribution
+        token_lengths = [len(token) for token in self.bpe.encoder.itos.values()]
+        df = pd.DataFrame({'Length': token_lengths})
+        fig = px.histogram(df, x='Length',
+                           title='Token Length Distribution',
+                           labels={'Length': 'Token Length', 'count': 'Frequency'})
+        fig.write_html(output_dir / "token_distribution.html")
+
+        # Compression visualization
+        comp_df = pd.DataFrame({
+            'Stage': ['Original', 'Encoded'],
+            'Size': [self.stats['original_length'], self.stats['encoded_length']]
+        })
+        fig = px.bar(comp_df, x='Stage', y='Size',
+                     title='Text Compression Comparison')
+        fig.write_html(output_dir / "compression.html")
+
+        # Save statistics to CSV
+        pd.DataFrame([self.stats]).to_csv(output_dir / "stats.csv")
+        print(f"Visualizations saved to {output_dir}")
+
+    def print_summary(self):
+        """Print summary of the tokenization process."""
+        print("\nTokenization Summary:")
+        print("-" * 50)
+        print(f"Dataset size: {self.stats['original_length']:,} characters")
+        print(f"Vocabulary size: {self.stats['vocab_size']:,} tokens")
+        print(f"Compression ratio: {self.stats['compression_ratio']:.2f}")
+        print(f"\nProcessing times:")
+        print(f"Loading: {self.stats['load_time']:.2f} seconds")
+        print(f"Training: {self.stats['train_time']:.2f} seconds")
+        print(f"Encoding: {self.stats['encode_time']:.2f} seconds")
+
+    def run_full_pipeline(self) -> Dict:
+        """Run the complete tokenization pipeline."""
+        self.load_data()
+        self.train_tokenizer()
+        self.encode_text()
+        self.save_visualizations()
+        self.print_summary()
+        return self.stats
+
+def main():
+    # Example usage
+    test = HindiBPETest(vocab_size=4500, num_articles=1000)
+    stats = test.run_full_pipeline()
+
+    # Test tokenization on a sample text
+    sample_text = """
+    भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।
+    हिंदी भारत की प्रमुख भाषाओं में से एक है।
+    """
+
+    print("\nTesting tokenization on sample text:")
+    tokens = test.bpe.encode(sample_text)
+    print(f"Original text: {sample_text}")
+    print(f"Tokens: {tokens}")
+    decoded = test.bpe.decode(tokens)
+    print(f"Decoded text: {decoded}")
+
+    # Verify compression ratio requirement
+    if stats['compression_ratio'] >= 3.2:
+        print("\nSuccess: Achieved required compression ratio ≥ 3.2")
+    else:
+        print("\nWarning: Compression ratio below target 3.2")
+
+    # Verify vocabulary size requirement
+    if stats['vocab_size'] < 5000:
+        print("Success: Vocabulary size within limit < 5000")
+    else:
+        print("Warning: Vocabulary size exceeds limit")
+
+if __name__ == "__main__":
+    main()
tokenizer/encoder.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d559243e553c94c34426987f1f4411554cc5ece15bce3f30d53668ec876f39dc
+size 540153
tokenizer/vocab_stats.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f30c70f6c7adee48d16f6796cf43b321b89f3927efffd1d16c6cff19b343f74d
+size 474148
tokenizer_config.json
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f062eed13569bd463ce96289411c435a2f328ea07a106946dbe9274489a1bc6
+size 359