# Hindi BPE Tokenizer — Hugging Face Spaces demo app.
# (Removed page-scrape artifacts: "Spaces:" / "Sleeping" status banner text.)
import os

import gradio as gr
import pandas as pd
import plotly.express as px
from huggingface_hub import snapshot_download

from hindi_bpe import HindiBPE, preprocess_hindi_text
# Fetch the pretrained tokenizer artifacts (JSON files only) from the Hub
# on first run; subsequent runs reuse the local "tokenizer" directory.
if not os.path.exists("tokenizer"):
    snapshot_download(
        repo_id="aayushraina/bpe-hindi",
        local_dir="tokenizer",
        allow_patterns=["*.json"],
    )
class TokenizerDemo:
    """Thin wrapper around the Hindi BPE tokenizer used by the Gradio UI."""

    def __init__(self):
        # Loads from the local "tokenizer" directory populated at import time
        # by the snapshot_download() call at the top of this file.
        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")

    def tokenize_text(self, text: str) -> tuple:
        """Tokenize *text* and return ``(tokens_str, figure, stats_str)``.

        Returns a placeholder message (with no figure) for empty input or
        input that yields no tokens.
        """
        if not text:
            return "", None, "Please enter some text"

        # Normalize the raw input before encoding.
        text = preprocess_hindi_text(text)
        tokens = self.tokenizer.encode(text)

        # Guard: the statistics below divide by the token count and by the
        # total token length; an empty token list would raise ZeroDivisionError.
        if not tokens:
            return "", None, "No tokens produced for this input"

        # Hoist the shared sum used by both average length and compression ratio.
        total_token_chars = sum(len(t) for t in tokens)

        # Visualization: one point per token position, length on the y-axis.
        token_df = pd.DataFrame({
            'Position': list(range(len(tokens))),
            'Token': tokens,
            'Length': [len(token) for token in tokens],
        })
        # An explicit DataFrame column for x is more robust than passing a
        # bare range object to plotly express.
        fig = px.scatter(token_df,
                         x='Position',
                         y='Length',
                         hover_data=['Token'],
                         title='Token Lengths in Sequence')

        stats = {
            'Total Tokens': len(tokens),
            'Unique Tokens': len(set(tokens)),
            'Average Token Length': total_token_chars / len(tokens),
            # Chars of (preprocessed) input per char of token text.
            'Compression Ratio': len(text) / total_token_chars,
        }
        stats_str = "\n".join(f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
                              for k, v in stats.items())

        return (
            " ".join(tokens),  # Tokenized text
            fig,               # Visualization
            stats_str,         # Statistics
        )

    def decode_tokens(self, tokens_text: str) -> str:
        """Decode space-separated tokens (as emitted by tokenize_text) back to text.

        NOTE(review): splitting on whitespace assumes no token itself contains
        a space — appears true for this BPE vocabulary, but worth confirming.
        """
        if not tokens_text:
            return "Please tokenize some text first"
        tokens = tokens_text.split()
        return self.tokenizer.decode(tokens)
# Create Gradio interface.
# Module-level script: builds the Blocks UI, wires event handlers, then launches.
demo = TokenizerDemo()
interface = gr.Blocks(title="Hindi BPE Tokenizer")
with interface:
    gr.Markdown("""
    # Hindi BPE Tokenizer Demo
    This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
    Enter Hindi text to see how it gets tokenized and analyze the token distribution.
    [View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
    """)
    with gr.Row():
        with gr.Column():
            # Left column: raw Hindi input and the tokenize trigger.
            input_text = gr.Textbox(
                label="Input Hindi Text",
                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
                lines=5
            )
            tokenize_btn = gr.Button("Tokenize")
        with gr.Column():
            # Right column: token output plus round-trip decoding back to text.
            tokens_output = gr.Textbox(
                label="Tokenized Output",
                lines=5
            )
            decode_btn = gr.Button("Decode")
            original_output = gr.Textbox(
                label="Decoded Text",
                lines=5
            )
    # NOTE(review): original indentation was lost in extraction — these two
    # widgets may have lived inside the right column; placed at Blocks level
    # here (full width). Handler wiring below is unaffected either way.
    stats_output = gr.Textbox(
        label="Tokenization Statistics",
        lines=4
    )
    plot_output = gr.Plot(
        label="Token Length Distribution"
    )
    # Set up event handlers: tokenize fills three outputs; decode reads the
    # tokenized text back through the tokenizer.
    tokenize_btn.click(
        fn=demo.tokenize_text,
        inputs=input_text,
        outputs=[tokens_output, plot_output, stats_output]
    )
    decode_btn.click(
        fn=demo.decode_tokens,
        inputs=tokens_output,
        outputs=original_output
    )
    # Add clickable example sentences that populate the input box.
    gr.Examples(
        examples=[
            ["हिंदी भाषा बहुत सुंदर है।"],
            ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
            ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"]
        ],
        inputs=input_text
    )
# Launch the interface (blocking call; serves the app).
interface.launch()