File size: 4,344 Bytes
238ef35
 
fb302fa
 
f18bb84
 
 
 
 
 
 
 
 
 
 
238ef35
 
 
 
 
fb302fa
 
 
238ef35
fb302fa
 
238ef35
fb302fa
f18bb84
fb302fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238ef35
 
fb302fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238ef35
 
 
 
fb302fa
 
 
 
 
238ef35
fb302fa
238ef35
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import json
import pickle

import gradio as gr
import keras
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM

# Layer classes handed to load_model so it can resolve them while
# deserialising the saved .h5 file.
custom_objects = {
    'LSTM': keras.layers.LSTM,
}

# Load the LSTM seq2seq model without its saved training configuration,
# then compile it afresh with Adam and sparse categorical cross-entropy.
model_lstm = load_model('seq2seq_model.h5', custom_objects=custom_objects, compile=False)
model_lstm.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Hugging Face repositories: the fine-tuned model, and the Helsinki-NLP
# checkpoint that supplies the tokenizer and the base (not fine-tuned) model.
model_checkpoint = "himanishprak23/neural_machine_translation"
tokenizer_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

# Base Helsinki-NLP pipeline: tokenizer and model both come from the
# Helsinki-NLP checkpoint, so its output can be compared against the
# fine-tuned model below.
# NOTE(review): assumes the Helsinki-NLP repository publishes TensorFlow
# weights; if it does not, pass from_pt=True here - confirm.
tokenizer_base_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model_base_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(tokenizer_checkpoint)

# Fine-tuned pipeline: Helsinki-NLP tokenizer with the fine-tuned weights.
tokenizer_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Tokenizers and sequence lengths for the trained LSTM translation model.
# NOTE: pickle.load can execute arbitrary code from the file it reads; only
# ship tokenizer pickles that were produced by a trusted source.
with open('eng_tokenizer.pkl', 'rb') as file:
    eng_tokenizer = pickle.load(file)
with open('hin_tokenizer.pkl', 'rb') as file:
    hin_tokenizer = pickle.load(file)
# Passed to translate_text_lstm: the English input padding length and the
# cap on the number of Hindi decoding steps.
max_len_eng = 20
max_len_hin = 22

def translate_text_base_nmt(input_text):
    """Translate ``input_text`` with the base NMT tokenizer/model pair.

    The text is tokenized into TensorFlow tensors (truncated to 128 tokens),
    passed to ``model_base_nmt.generate`` with a 128-token output cap, and
    the first generated sequence is decoded back into a string.

    Args:
        input_text: Text to translate.

    Returns:
        The decoded translation with special tokens removed.
    """
    encoded = tokenizer_base_nmt(input_text, return_tensors='tf', max_length=128, truncation=True)
    generated = model_base_nmt.generate(**encoded, max_length=128)
    # Decode with the same tokenizer that encoded the input, so this pipeline
    # does not depend on the fine-tuned pipeline's tokenizer object.
    return tokenizer_base_nmt.decode(generated[0], skip_special_tokens=True)

def translate_text_nmt(input_text):
    """Translate ``input_text`` with ``tokenizer_nmt`` and ``model_nmt``.

    Input is truncated to 128 tokens, generation is capped at 128 tokens,
    and the first generated sequence is returned as text with special
    tokens stripped.
    """
    encoded = tokenizer_nmt(
        input_text,
        return_tensors='tf',
        max_length=128,
        truncation=True,
    )
    output_ids = model_nmt.generate(**encoded, max_length=128)
    first_sequence = output_ids[0]
    return tokenizer_nmt.decode(first_sequence, skip_special_tokens=True)

def translate_text_lstm(sentence, model, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin):
    """Greedily decode a translation of ``sentence`` with a seq2seq model.

    The sentence is converted to a padded id sequence, then the model is
    called repeatedly with the encoder input and the decoder tokens produced
    so far; at each step the highest-scoring token at the last position is
    taken as the next word.

    Args:
        sentence: Source text to translate.
        model: Object whose ``predict([encoder_input, decoder_input])``
            returns an array indexable as ``[batch, step, token]``.
        eng_tokenizer: Tokenizer exposing ``texts_to_sequences``.
        hin_tokenizer: Tokenizer exposing ``word_index``; it must contain
            the word ``'start'``, and ``'end'`` is treated as the stop word.
        max_len_eng: Length the encoder input is post-padded to.
        max_len_hin: Maximum number of decoding steps; at most
            ``max_len_hin - 1`` words are emitted.

    Returns:
        The decoded words joined by single spaces (possibly empty).

    Raises:
        KeyError: If ``'start'`` is missing from ``hin_tokenizer.word_index``.
    """
    # Tokenize and pad the input sentence
    encoder_input = pad_sequences(
        eng_tokenizer.texts_to_sequences([sentence]),
        maxlen=max_len_eng,
        padding='post',
    )

    # Reverse lookup: token id -> target-language word
    index_to_word = {idx: word for word, idx in hin_tokenizer.word_index.items()}

    # Running decoder input. It always begins with the start token and grows
    # by exactly one id per emitted word, so the model sees
    # [start, w1, ..., wk] rather than a sequence that drops the start token
    # and repeats the last word.
    decoder_ids = [hin_tokenizer.word_index['start']]
    decoded_words = []

    for _ in range(max_len_hin):
        # Float array of shape (1, len(decoder_ids)), matching np.zeros' dtype
        decoder_input = np.array([decoder_ids], dtype=float)
        output = model.predict([encoder_input, decoder_input], verbose=0)
        next_id = int(np.argmax(output[0, -1, :]))
        next_word = index_to_word.get(next_id, '')

        # Stop on the end marker, on an id with no word (e.g. padding id 0),
        # or once the output length cap has been reached.
        if next_word in ('end', '') or len(decoded_words) >= max_len_hin - 1:
            break

        decoded_words.append(next_word)
        decoder_ids.append(next_id)

    return ' '.join(decoded_words)


def translate_text(input_text):
    """Translate ``input_text`` with all three models.

    Returns:
        A 3-tuple ``(lstm, base_nmt, fine_tuned_nmt)`` of translations,
        produced in that order.
    """
    lstm_settings = (model_lstm, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin)
    return (
        translate_text_lstm(input_text, *lstm_settings),
        translate_text_base_nmt(input_text),
        translate_text_nmt(input_text),
    )

# Build the Gradio UI: one English input box and one output box per model,
# in the same order as the tuple returned by translate_text.
iface = gr.Interface(
    title="English to Hindi Translator",
    description="Enter English text and get the Hindi translation from three different models: LSTM, Base Helsinki-NLP, and Fine-tuned Helsinki-NLP.",
    fn=translate_text,
    inputs=gr.components.Textbox(
        lines=2,
        placeholder="Enter text to translate from English to Hindi...",
    ),
    outputs=[
        gr.components.Textbox(label=output_label)
        for output_label in (
            "Translation (LSTM Model)",
            "Translation (Base Helsinki Model)",
            "Translation (Fine-tuned Helsinki Model)",
        )
    ],
)

# Start serving the app
iface.launch()