# Hugging Face Spaces page header captured with the file (Space status: Sleeping); not part of the program.
import re

import gradio as gr
import numpy as np
from huggingface_hub import from_pretrained_keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
# Load the pre-trained transliteration model from the Hugging Face Hub.
model = from_pretrained_keras("vrclc/transliteration")

# Source-side (Latin script) character tokenizer: lowercase a-z plus space.
# NOTE(review): the resulting character->index mapping presumably has to match
# the vocabulary the model was trained with - confirm before editing this list.
source_tokens = list('abcdefghijklmnopqrstuvwxyz ')
source_tokenizer = Tokenizer(char_level=True, filters='')
source_tokenizer.fit_on_texts(source_tokens)

# Target-side (Malayalam script) characters.
# NOTE(review): as above, do not reorder or extend without checking the
# model's training vocabulary.
malayalam_tokens = [
    # Independent vowels
    'അ', 'ആ', 'ഇ', 'ഈ', 'ഉ', 'ഊ', 'ഋ', 'ൠ', 'ഌ', 'ൡ', 'എ', 'ഏ', 'ഐ', 'ഒ', 'ഓ', 'ഔ',
    # Consonants
    'ക', 'ഖ', 'ഗ', 'ഘ', 'ങ', 'ച', 'ഛ', 'ജ', 'ഝ', 'ഞ',
    'ട', 'ഠ', 'ഡ', 'ഢ', 'ണ', 'ത', 'ഥ', 'ദ', 'ധ', 'ന',
    'പ', 'ഫ', 'ബ', 'ഭ', 'മ', 'യ', 'ര', 'ല', 'വ', 'ശ',
    'ഷ', 'സ', 'ഹ', 'ള', 'ഴ', 'റ',
    # Chillu letters
    'ൺ', 'ൻ', 'ർ', 'ൽ', 'ൾ',
    # Additional characters
    'ം', 'ഃ', '്',
    # Vowel modifiers / Signs
    'ാ', 'ി', 'ീ', 'ു', 'ൂ', 'ൃ', 'ൄ', 'െ', 'േ', 'ൈ', 'ൊ', 'ോ', 'ൌ', 'ൗ', ' '
]

# Create tokenizer for Malayalam tokens
target_tokenizer = Tokenizer(char_level=True, filters='')
target_tokenizer.fit_on_texts(malayalam_tokens)

# Sequence length the model expects, read from the declared input shape of
# its "encoder_input" layer.
max_seq_length = model.get_layer("encoder_input").input_shape[0][1]
def transliterate_with_split_tokens(input_text, model, source_tokenizer, target_tokenizer, max_seq_length):
    """Transliterate the Latin-letter words of ``input_text``, keeping the rest.

    The text is split into alternating runs of ASCII letters ("words") and
    runs of anything else (spaces, digits, punctuation, emoji, ...). Words are
    encoded with ``source_tokenizer``, padded to ``max_seq_length``, sent to
    ``model.predict`` in a single batch and decoded through
    ``target_tokenizer.index_word``. All other runs are copied to the output
    unchanged and in their original position.

    Args:
        input_text: Text to transliterate. ``None`` or ``""`` yields ``""``.
        model: Object whose ``predict`` returns, for each input row, scores
            over target indices for every position (argmax is taken over the
            last axis).
        source_tokenizer: Provides ``texts_to_sequences`` for the words.
        target_tokenizer: Provides ``index_word`` (index -> character).
        max_seq_length: Length each encoded word is padded or truncated to.

    Returns:
        The transliterated text as a single string.
    """
    if not input_text:
        return ""

    # Every match is a (word, other) pair in which exactly one side is
    # non-empty; together the matches cover the whole input in order.
    segments = re.findall(r"([a-zA-Z]+)|([^a-zA-Z]+)", input_text)
    words = [word for word, _ in segments if word]
    if not words:
        # Nothing to transliterate: skip the model entirely.
        return "".join(other for _, other in segments)

    encoded = source_tokenizer.texts_to_sequences(words)
    # NOTE(review): pad_sequences truncates from the front by default, so a
    # word longer than max_seq_length loses its first characters - confirm
    # this is the intended handling of over-long words.
    padded = pad_sequences(encoded, maxlen=max_seq_length, padding='post')

    # One predict call for all words instead of one call per word.
    scores = model.predict(padded, verbose=0)
    best_indices = np.argmax(scores, axis=-1)

    index_word = target_tokenizer.index_word
    # Index 0 is padding; indices missing from the tokenizer are skipped
    # rather than raising KeyError.
    decoded_words = iter([
        "".join(index_word.get(int(idx), "") for idx in row if idx != 0)
        for row in best_indices
    ])

    return "".join(
        next(decoded_words) if word else other
        for word, other in segments
    )
def transliterate(input_text):
    """Transliterate ``input_text`` with the module-level model and tokenizers.

    Single-argument adapter around ``transliterate_with_split_tokens`` that
    supplies the module-level ``model``, ``source_tokenizer``,
    ``target_tokenizer`` and ``max_seq_length``.

    Args:
        input_text: Text to transliterate.

    Returns:
        The transliterated text.
    """
    return transliterate_with_split_tokens(
        input_text, model, source_tokenizer, target_tokenizer, max_seq_length
    )
def create_transliteration_interface():
    """Build the Gradio interface for the transliteration demo.

    Creates one input and one output textbox and wires them to
    ``transliterate`` together with the title, description, usage notes and
    example inputs.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    # Define the input and output components once and pass them to the
    # interface, instead of building a second, unused pair inline.
    input_textbox = gr.Textbox(
        lines=3,
        placeholder="Enter Manglish text to transliterate to Malayalam...",
        label="Input Text"
    )
    output_textbox = gr.Textbox(
        lines=3,
        label="Transliterated Malayalam Text"
    )

    interface = gr.Interface(
        fn=transliterate,
        inputs=[input_textbox],
        outputs=[output_textbox],
        title="🌟 English to Malayalam Transliterator",
        description="Transliterate Manglish (Romanised Malayalam) text to Malayalam characters. Simply type or paste your Manglish text, and see the Malayalam transliteration instantly!",
        article="## How to Use\n1. Enter Manglish text in the input box\n2. The transliteration will appear after you SUBMIT\n3. Works with words, phrases, and sentences",
        examples=[
            ["ente veed"],
            ["malayalam padikkano ? 😃"],
            ["india ente rajyamanu"]
        ],
        # Examples are not pre-computed; they run through the model on demand.
        cache_examples=False,
        theme="huggingface"
    )
    return interface
# Launch the Gradio interface only when this file is run as a script.
if __name__ == "__main__":
    iface = create_transliteration_interface()
    iface.launch()