bye-hindi / app.py
aayushraina's picture
Upload 13 files
46759b2 verified
import gradio as gr
from huggingface_hub import snapshot_download
from hindi_bpe import HindiBPE, preprocess_hindi_text
import pandas as pd
import plotly.express as px
import os
# Fetch the tokenizer's JSON files from the Hugging Face Hub on first run;
# skip the download when a local "tokenizer" path is already present.
_tokenizer_present = os.path.exists("tokenizer")
if not _tokenizer_present:
    snapshot_download(
        repo_id="aayushraina/bpe-hindi",
        allow_patterns=["*.json"],
        local_dir="tokenizer",
    )
class TokenizerDemo:
    """Callbacks behind the demo UI: tokenize, visualize and decode text.

    Attributes:
        tokenizer: Object returned by ``HindiBPE.load_tokenizer("tokenizer")``;
            its ``encode`` and ``decode`` methods are used below.
    """

    def __init__(self):
        """Load the tokenizer from the local ``tokenizer`` directory."""
        self.tokenizer = HindiBPE.load_tokenizer("tokenizer")

    def tokenize_text(self, text: str) -> tuple:
        """Tokenize ``text`` and build the values shown in the UI.

        Args:
            text: Raw text from the input box (may be empty or ``None``).

        Returns:
            A 3-tuple ``(tokenized_text, figure, stats)``:
            the tokens joined by single spaces, a scatter plot of token
            length against position in the sequence, and a multi-line
            statistics summary. When there is nothing to tokenize the
            tuple is ``("", None, "Please enter some text")``.
        """
        # Whitespace-only input carries nothing to tokenize either; reject it
        # up front together with empty / missing input.
        if not text or not text.strip():
            return "", None, "Please enter some text"

        # Preprocess
        text = preprocess_hindi_text(text)

        # Tokenize
        tokens = self.tokenizer.encode(text)
        # An empty token list would make the averages below divide by zero.
        if len(tokens) == 0:
            return "", None, "Please enter some text"

        # Create visualization
        token_df = pd.DataFrame({
            'Token': tokens,
            'Length': [len(token) for token in tokens],
        })
        fig = px.scatter(
            token_df,
            x=range(len(tokens)),
            y='Length',
            hover_data=['Token'],
            title='Token Lengths in Sequence',
        )

        return (
            " ".join(tokens),                  # Tokenized text
            fig,                               # Visualization
            self._format_stats(text, tokens),  # Statistics
        )

    @staticmethod
    def _format_stats(text, tokens) -> str:
        """Render the tokenization statistics as one ``name: value`` per line."""
        token_lengths = [len(token) for token in tokens]
        total_chars = sum(token_lengths)
        stats = {
            'Total Tokens': len(tokens),
            'Unique Tokens': len(set(tokens)),
            # Both ratios fall back to 0.0 instead of dividing by zero.
            'Average Token Length': (
                total_chars / len(tokens) if len(tokens) else 0.0
            ),
            'Compression Ratio': (
                len(text) / total_chars if total_chars else 0.0
            ),
        }
        # Floats are shown with two decimals, counts as plain integers.
        return "\n".join(
            f"{k}: {v:.2f}" if isinstance(v, float) else f"{k}: {v}"
            for k, v in stats.items()
        )

    def decode_tokens(self, tokens_text: str) -> str:
        """Decode space-separated tokens back to text.

        Args:
            tokens_text: Tokens separated by whitespace, as produced by
                ``tokenize_text``.

        Returns:
            The result of ``self.tokenizer.decode`` on the split tokens, or
            ``"Please tokenize some text first"`` when no tokens are given.
        """
        tokens = tokens_text.split() if tokens_text else []
        # Covers empty, missing and whitespace-only input alike.
        if not tokens:
            return "Please tokenize some text first"
        return self.tokenizer.decode(tokens)
# Build the Gradio UI around one shared TokenizerDemo instance.
demo = TokenizerDemo()

# NOTE(review): the nesting of the widgets below (which ones sit inside the
# row/columns) was reconstructed; confirm it matches the intended layout.
with gr.Blocks(title="Hindi BPE Tokenizer") as interface:
    gr.Markdown("""
# Hindi BPE Tokenizer Demo
This demo showcases a Byte Pair Encoding (BPE) tokenizer specifically trained for Hindi text.
Enter Hindi text to see how it gets tokenized and analyze the token distribution.
[View model on Hugging Face](https://huggingface.co/aayushraina/bpe-hindi)
""")

    # Input on the left, tokenized output on the right.
    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                lines=5,
                label="Input Hindi Text",
                placeholder="हिंदी में टेक्स्ट दर्ज करें...",
            )
            tokenize_btn = gr.Button("Tokenize")

        with gr.Column():
            tokens_output = gr.Textbox(lines=5, label="Tokenized Output")
            decode_btn = gr.Button("Decode")

    # Result panels: decoded text, statistics summary and the length plot.
    original_output = gr.Textbox(lines=5, label="Decoded Text")
    stats_output = gr.Textbox(lines=4, label="Tokenization Statistics")
    plot_output = gr.Plot(label="Token Length Distribution")

    # Wire the buttons to the TokenizerDemo callbacks.
    tokenize_btn.click(
        fn=demo.tokenize_text,
        inputs=input_text,
        outputs=[tokens_output, plot_output, stats_output],
    )
    decode_btn.click(
        fn=demo.decode_tokens,
        inputs=tokens_output,
        outputs=original_output,
    )

    # Clickable sample sentences that fill the input box.
    example_sentences = [
        ["हिंदी भाषा बहुत सुंदर है।"],
        ["भारत एक विशाल देश है। यहाँ की संस्कृति बहुत पुरानी है।"],
        ["मैं हिंदी में प्रोग्रामिंग सीख रहा हूं।"],
    ]
    gr.Examples(examples=example_sentences, inputs=input_text)

# Start serving the app.
interface.launch()