Spaces:
Sleeping
Sleeping
import gradio as gr | |
from telugu_bpe import TeluguBPE | |
import os | |
# Tokenizer instance shared by the rest of this module (vocab_size=5000).
bpe = TeluguBPE(vocab_size=5000)

# The model file is looked up in the same directory as this script.
current_dir = os.path.dirname(os.path.abspath(__file__))
model_path = os.path.join(current_dir, "telugu_bpe_model.json")

# Try the pre-trained model first. If its file is missing, build a small
# fallback model from the sample sentences below via learn_bpe() and write
# it to model_path with save_model().
try:
    bpe.load_model(model_path)
except FileNotFoundError:
    print(f"Error: Model file not found at {model_path}")
    sample_text = """
నమస్కారం తెలుగు భాష చాలా అందమైన భాష
తెలుగు భారతదేశంలోని ద్రావిడ భాషల్లో ఒకటి
తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి
"""
    processed_text = bpe.preprocess_telugu_text(sample_text)
    bpe.learn_bpe(processed_text)
    bpe.save_model(model_path)
    print("Created a new model with sample text")
else:
    # Only reached when load_model() returned without raising.
    print("Model loaded successfully!")
def process_text(input_text: str) -> dict:
    """Tokenize Telugu text with the module-level ``bpe`` model.

    Args:
        input_text: Raw text entered by the user.

    Returns:
        A dict with the preprocessed text, the encoded tokens, character
        and token counts, the character-to-token ratio formatted as
        ``"<ratio>x"``, and the vocabulary size. When the input is empty or
        whitespace-only, or when any step raises, a dict with a single
        ``"Error"`` key is returned instead.
    """
    # Guard clause: reject missing, empty, or whitespace-only input.
    if not (input_text and input_text.strip()):
        return {"Error": "Please enter some Telugu text"}

    try:
        cleaned = bpe.preprocess_telugu_text(input_text)
        tokens = bpe.encode(cleaned)

        n_chars = len(cleaned)
        n_tokens = len(tokens)

        # Avoid dividing by zero when encoding produced no tokens.
        if n_tokens > 0:
            ratio = n_chars / n_tokens
        else:
            ratio = 0

        return {
            "Preprocessed Text": cleaned,
            "Tokens": tokens,
            "Character Count": n_chars,
            "Token Count": n_tokens,
            "Compression Ratio": f"{ratio:.2f}x",
            "Vocabulary Size": len(bpe.vocab),
        }
    except Exception as exc:
        # Report the failure to the caller as data rather than raising.
        return {"Error": f"An error occurred: {exc!s}"}
# Description text handed to gr.Interface (written as Markdown).
_DESCRIPTION = """
## Telugu Byte Pair Encoding (BPE) Tokenizer
This tokenizer is specifically designed for Telugu text processing with a vocabulary size of ~5000 tokens.
### Features:
- Telugu-specific preprocessing
- BPE tokenization
- Compression statistics
- Character and token counts
### How to use:
1. Enter Telugu text in the input box
2. Get tokenized output and statistics
### Example inputs provided below ⬇️
"""

# Example inputs; each inner list supplies the value for the single textbox.
_EXAMPLES = [
    ["నమస్కారం"],
    ["తెలుగు భాష చాలా అందమైన భాష"],
    ["నేను తెలుగులో మాట్లాడగలను"],
    ["తెలుగు అక్షరమాల లో 56 అక్షరాలు ఉన్నాయి"],
]

# Gradio UI: one text input routed through process_text, rendered as JSON.
demo = gr.Interface(
    fn=process_text,
    inputs=[
        gr.Textbox(
            lines=4,
            placeholder="Enter Telugu text here...",
            label="Input Telugu Text",
            value="నమస్కారం",
        ),
    ],
    outputs=gr.JSON(label="Tokenization Results"),
    title="Telugu BPE Tokenizer",
    description=_DESCRIPTION,
    examples=_EXAMPLES,
    theme=gr.themes.Soft(),
    allow_flagging="never",
    cache_examples=True,
)
# Start the server only when this file is executed as a script.
# The host and port below are the launch settings used for Hugging Face Spaces.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo.launch(**launch_options)