import os

import gradio as gr
import pandas as pd
import plotly.express as px
from huggingface_hub import snapshot_download

from hindi_bpe import HindiBPE, preprocess_hindi_text
# Download the tokenizer files if they are not already present locally
if not os.path.exists("tokenizer"):
    snapshot_download(
        repo_id="aayushraina/bpe-hindi",
        local_dir="tokenizer",
        allow_patterns=["*.json"]
    )
class TokenizerDemo:
    def __init__(self):
        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")

    def tokenize_text(self, text: str) -> tuple:
        """Tokenize text and return the tokens, a plot, and summary statistics."""
        if not text:
            return "", None, "Please enter some text"

        # Preprocess and tokenize
        text = preprocess_hindi_text(text)
        tokens = self.tokenizer.encode(text)
        if not tokens:
            return "", None, "No tokens were produced for this input"

        # Visualize token lengths across the sequence
        token_df = pd.DataFrame({
            'Position': range(len(tokens)),
            'Token': tokens,
            'Length': [len(token) for token in tokens]
        })
        fig = px.scatter(
            token_df,
            x='Position',
            y='Length',
            hover_data=['Token'],
            title='Token Lengths in Sequence'
        )

        # Calculate statistics
        total_token_chars = sum(len(t) for t in tokens)
        stats = {
            'Total Tokens': len(tokens),
            'Unique Tokens': len(set(tokens)),
            'Average Token Length': total_token_chars / len(tokens),
            'Compression Ratio': len(text) / total_token_chars
        }
        stats_str = "\n".join(
            f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
            for k, v in stats.items()
        )

        return (
            " ".join(tokens),  # Tokenized text
            fig,               # Visualization
            stats_str          # Statistics
        )

    def decode_tokens(self, tokens_text: str) -> str:
        """Decode space-separated tokens back to text."""
        if not tokens_text:
            return "Please tokenize some text first"
        tokens = tokens_text.split()
        return self.tokenizer.decode(tokens)
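# A minimal round-trip sanity check (a sketch, assuming HindiBPE.encode returns a list of
# string tokens and HindiBPE.decode accepts the same list, as tokenize_text/decode_tokens
# above imply). It is defined but never called here; invoke it manually when debugging.
def _roundtrip_check(tokenizer: HindiBPE, sample: str = "हिंदी भाषा बहुत सुंदर है।") -> bool:
    """Return True if encoding and then decoding the preprocessed sample restores it."""
    text = preprocess_hindi_text(sample)
    tokens = tokenizer.encode(text)
    return tokenizer.decode(tokens) == text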
# Create the Gradio interface
demo = TokenizerDemo()

interface = gr.Blocks(title="Hindi BPE Tokenizer")

with interface:
    gr.Markdown("""
    # Hindi BPE Tokenizer Demo

    This demo showcases a Byte Pair Encoding (BPE) tokenizer trained specifically for Hindi text.
    Enter Hindi text to see how it is tokenized and to analyze the token distribution.

    [View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
    """)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Hindi Text",
                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
                lines=5
            )
            tokenize_btn = gr.Button("Tokenize")

        with gr.Column():
            tokens_output = gr.Textbox(
                label="Tokenized Output",
                lines=5
            )
            decode_btn = gr.Button("Decode")
            original_output = gr.Textbox(
                label="Decoded Text",
                lines=5
            )

    stats_output = gr.Textbox(
        label="Tokenization Statistics",
        lines=4
    )
    plot_output = gr.Plot(
        label="Token Length Distribution"
    )

    # Set up event handlers
    tokenize_btn.click(
        fn=demo.tokenize_text,
        inputs=input_text,
        outputs=[tokens_output, plot_output, stats_output]
    )
    decode_btn.click(
        fn=demo.decode_tokens,
        inputs=tokens_output,
        outputs=original_output
    )

    # Add examples
    gr.Examples(
        examples=[
            ["हिंदी भाषा बहुत सुंदर है।"],
            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
        ],
        inputs=input_text
    )

# Launch the interface
interface.launch()
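# For quick experimentation without the web UI, the handlers above can also be called
# directly (illustrative only; this mirrors the click events wired in the Blocks context):
#
#     tokens_str, fig, stats = demo.tokenize_text("हिंदी भाषा बहुत सुंदर है।")
#     restored = demo.decode_tokens(tokens_str)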