File size: 4,216 Bytes
46759b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import gradio as gr
from huggingface_hub import snapshot_download
from hindi_bpe import HindiBPE, preprocess_hindi_text
import pandas as pd
import plotly.express as px
import os

# Download tokenizer if not exists
# One-time bootstrap at import: fetch the tokenizer JSON files from the
# Hugging Face Hub into ./tokenizer, which TokenizerDemo later loads.
# NOTE(review): os.path.exists only checks the directory is present — it does
# not verify a previous download completed; a partial earlier run would be
# silently reused. Confirm whether snapshot_download's own caching suffices.
if not os.path.exists("tokenizer"):
    snapshot_download(
        repo_id="aayushraina/bpe-hindi",
        local_dir="tokenizer",
        allow_patterns=["*.json"]  # only tokenizer config/vocab, not model weights
    )

class TokenizerDemo:
    """Gradio-facing wrapper around a pre-trained Hindi BPE tokenizer.

    Exposes two callbacks: ``tokenize_text`` (text -> tokens + plot + stats)
    and ``decode_tokens`` (space-joined tokens -> text).
    """

    def __init__(self):
        # Assumes the tokenizer files were downloaded to ./tokenizer by the
        # module-level bootstrap before this class is instantiated.
        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")

    def tokenize_text(self, text: str) -> tuple:
        """Tokenize ``text`` and return (tokenized string, figure, stats).

        Returns a 3-tuple of:
          * the tokens joined with single spaces,
          * a plotly scatter of token length vs. position (None on no input),
          * a human-readable statistics string.
        """
        if not text:
            return "", None, "Please enter some text"

        # Normalize the raw input before tokenizing.
        text = preprocess_hindi_text(text)

        # Tokenize. NOTE(review): the code below (len(token), " ".join)
        # assumes encode() returns string tokens, not integer ids — confirm
        # against HindiBPE.encode.
        tokens = self.tokenizer.encode(text)

        # Guard: preprocessing may strip the input to nothing, yielding an
        # empty token list — the averages below would divide by zero.
        if not tokens:
            return "", None, "No tokens produced - please enter Hindi text"

        # Create visualization
        token_df = pd.DataFrame({
            'Token': tokens,
            'Length': [len(token) for token in tokens]
        })

        fig = px.scatter(token_df,
                        x=list(range(len(tokens))),  # position in sequence
                        y='Length',
                        hover_data=['Token'],
                        title='Token Lengths in Sequence')

        # Calculate statistics (hoist the repeated character count).
        total_token_chars = sum(len(t) for t in tokens)
        stats = {
            'Total Tokens': len(tokens),
            'Unique Tokens': len(set(tokens)),
            'Average Token Length': total_token_chars / len(tokens),
            # Source characters per token character; guarded against
            # pathological all-empty tokens.
            'Compression Ratio': (len(text) / total_token_chars
                                  if total_token_chars else 0.0),
        }

        # Floats get two decimals; integer counts are printed verbatim.
        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
                             for k, v in stats.items())

        return (
            " ".join(tokens),  # Tokenized text
            fig,              # Visualization
            stats_str        # Statistics
        )

    def decode_tokens(self, tokens_text: str) -> str:
        """Decode space-separated tokens back to text.

        NOTE(review): tokens are re-split on whitespace, so the round-trip
        through the UI is lossless only if no token itself contains a space.
        """
        if not tokens_text:
            return "Please tokenize some text first"
        tokens = tokens_text.split()
        return self.tokenizer.decode(tokens)

# Create Gradio interface
# Built at import time; `demo` holds the tokenizer wrapper whose bound
# methods serve as the button callbacks below.
demo = TokenizerDemo()

interface = gr.Blocks(title="Hindi BPE Tokenizer")

with interface:
    gr.Markdown("""
    # Hindi BPE Tokenizer Demo
    
    This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
    Enter Hindi text to see how it gets tokenized and analyze the token distribution.
    
    [View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
    """)
    
    # Two-column layout: raw input + Tokenize button on the left,
    # tokenized output + Decode button on the right.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Hindi Text",
                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
                lines=5
            )
            tokenize_btn = gr.Button("Tokenize")
        
        with gr.Column():
            tokens_output = gr.Textbox(
                label="Tokenized Output",
                lines=5
            )
            decode_btn = gr.Button("Decode")
            
    # Full-width outputs below the row: decoded round-trip text,
    # tokenization statistics, and the token-length plot.
    original_output = gr.Textbox(
        label="Decoded Text",
        lines=5
    )
    
    stats_output = gr.Textbox(
        label="Tokenization Statistics",
        lines=4
    )
    
    plot_output = gr.Plot(
        label="Token Length Distribution"
    )
    
    # Set up event handlers
    # tokenize_text returns (tokens string, figure, stats string) — the
    # outputs list below must stay in that order.
    tokenize_btn.click(
        fn=demo.tokenize_text,
        inputs=input_text,
        outputs=[tokens_output, plot_output, stats_output]
    )
    
    # Decode reads the space-joined tokens box, not the original input.
    decode_btn.click(
        fn=demo.decode_tokens,
        inputs=tokens_output,
        outputs=original_output
    )
    
    # Add examples
    # Clicking an example only fills the input box; the user still presses
    # Tokenize.
    gr.Examples(
        examples=[
            ["हिंदी भाषा बहुत सुंदर है।"],
            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
        ],
        inputs=input_text
    )

# Launch the interface
interface.launch()