himanishprak23 Kumarkishalaya committed on
Commit
fb302fa
·
verified ·
1 Parent(s): a8dfe6f

Update app.py (#2)

Browse files

- Update app.py (ef12feaa62ba7c828e831cd05ff3933e07666555)


Co-authored-by: Kumar Kishalaya <[email protected]>

Files changed (1) hide show
  1. app.py +71 -8
app.py CHANGED
@@ -1,27 +1,90 @@
1
  import gradio as gr
2
  from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
 
 
3
 
4
  # Define the model repository and tokenizer checkpoint
5
  model_checkpoint = "himanishprak23/neural_machine_translation"
6
  tokenizer_checkpoint = "Helsinki-NLP/opus-mt-en-hi"
7
 
 
 
 
8
  # Load the tokenizer from Helsinki-NLP and model from Hugging Face repository
9
- tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
10
- model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
11
 
12
- def translate_text(input_text):
13
- tokenized_input = tokenizer(input_text, return_tensors='tf', max_length=128, truncation=True)
14
- generated_tokens = model.generate(**tokenized_input, max_length=128)
15
- predicted_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  return predicted_text
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # Create the Gradio interface
19
  iface = gr.Interface(
20
  fn=translate_text,
21
  inputs=gr.components.Textbox(lines=2, placeholder="Enter text to translate from English to Hindi..."),
22
- outputs=gr.components.Textbox(),
 
 
 
 
23
  title="English to Hindi Translator",
24
- description="Enter English text and get the Hindi translation."
25
  )
26
 
27
  # Launch the Gradio app
 
import pickle

import gradio as gr
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from transformers import AutoTokenizer, TFAutoModelForSeq2SeqLM
5
 
# Define the model repository and tokenizer checkpoint.
# model_checkpoint is the fine-tuned repo; tokenizer_checkpoint is the
# original Helsinki-NLP base checkpoint.
model_checkpoint = "himanishprak23/neural_machine_translation"
tokenizer_checkpoint = "Helsinki-NLP/opus-mt-en-hi"

# Load the BASE (non-fine-tuned) Helsinki-NLP model and its tokenizer.
# FIX: the original assignments were swapped — the `tokenizer_base_nmt`
# variable held a TFAutoModelForSeq2SeqLM and `model_base_nmt` held an
# AutoTokenizer — and the "base" slot loaded the fine-tuned weights even
# though the UI labels this output "Base Helsinki Model".
tokenizer_base_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model_base_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(tokenizer_checkpoint)

# Load the tokenizer from Helsinki-NLP and the fine-tuned model from the
# Hugging Face repository.
tokenizer_nmt = AutoTokenizer.from_pretrained(tokenizer_checkpoint)
model_nmt = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
16
 
17
# Loading models, tokenizer & variables for trained LSTM translation model.
# NOTE(review): pickle.load executes arbitrary code from the file — fine for
# local artifacts shipped with the app, but never load untrusted pickles.
model_lstm = load_model('seq2seq_model.h5')
with open('eng_tokenizer.pkl', 'rb') as file:
    eng_tokenizer = pickle.load(file)
with open('hin_tokenizer.pkl', 'rb') as file:
    hin_tokenizer = pickle.load(file)
# Sequence-length caps; presumably the values used when the LSTM was trained
# (must match the saved model's expected input widths — TODO confirm).
max_len_eng = 20
max_len_hin = 22
25
+
26
def translate_text_base_nmt(input_text):
    """Translate English text to Hindi with the base (non-fine-tuned) Helsinki model.

    FIX: decoding previously used `tokenizer_nmt` (the fine-tuned pipeline's
    tokenizer) — now uses this pipeline's own `tokenizer_base_nmt` so the
    function is self-contained and consistent with `translate_text_nmt`.
    Both tokenizers load the same checkpoint today, so output is unchanged.
    """
    tokenized_input = tokenizer_base_nmt(input_text, return_tensors='tf', max_length=128, truncation=True)
    generated_tokens = model_base_nmt.generate(**tokenized_input, max_length=128)
    predicted_text = tokenizer_base_nmt.decode(generated_tokens[0], skip_special_tokens=True)
    return predicted_text
31
+
32
def translate_text_nmt(input_text):
    """Translate English text to Hindi with the fine-tuned Helsinki model."""
    encoded = tokenizer_nmt(input_text, return_tensors='tf', max_length=128, truncation=True)
    output_ids = model_nmt.generate(**encoded, max_length=128)
    return tokenizer_nmt.decode(output_ids[0], skip_special_tokens=True)
37
 
38
def translate_text_lstm(sentence, model, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin):
    """Greedy-decode a Hindi translation of `sentence` with the seq2seq LSTM.

    Args:
        sentence: English input text.
        model: Keras model called as model.predict([encoder_input, decoder_input]);
            assumed to return per-timestep vocabulary scores of shape
            (1, t, vocab) — TODO confirm against the saved model.
        eng_tokenizer: fitted Keras tokenizer for English.
        hin_tokenizer: fitted Keras tokenizer for Hindi; its word_index must
            contain 'start' (KeyError otherwise) and normally 'end'.
        max_len_eng: encoder input padding length.
        max_len_hin: maximum number of decode steps.

    Returns:
        The decoded Hindi words joined by single spaces (may be empty).
    """
    # Tokenize and pad the input sentence
    input_seq = eng_tokenizer.texts_to_sequences([sentence])
    input_seq = pad_sequences(input_seq, maxlen=max_len_eng, padding='post')

    # Initialize target sequence with start token
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = hin_tokenizer.word_index['start']

    # Create reverse word index for Hindi
    reverse_word_index = dict([(idx, word) for word, idx in hin_tokenizer.word_index.items()])

    decoded_sentence = []

    for _ in range(max_len_hin):
        # Score the next token given the full decoder prefix built so far.
        output = model.predict([input_seq, target_seq], verbose=0)
        sampled_token_index = np.argmax(output[0, -1, :])
        sampled_word = reverse_word_index.get(sampled_token_index, '')

        # Stop on the end token, an out-of-vocabulary index, or length cap.
        if sampled_word == 'end' or sampled_word == '' or len(decoded_sentence) >= max_len_hin - 1:
            break

        decoded_sentence.append(sampled_word)

        # Update target sequence: rebuild the whole decoder input from the
        # words decoded so far, then append the latest sampled index.
        # NOTE(review): the last sampled token ends up written twice (as the
        # final decoded word and again at the next position) — verify this
        # matches the decoder-input format the model was trained with.
        target_seq = np.zeros((1, len(decoded_sentence) + 1))
        for t, word in enumerate(decoded_sentence):
            target_seq[0, t] = hin_tokenizer.word_index.get(word, 0)  # Use 0 for unknown words
        target_seq[0, len(decoded_sentence)] = sampled_token_index

    return ' '.join(decoded_sentence)
69
+
70
+
71
def translate_text(input_text):
    """Run all three translators on the input and return their outputs in order."""
    lstm_out = translate_text_lstm(
        input_text, model_lstm, eng_tokenizer, hin_tokenizer, max_len_eng, max_len_hin
    )
    base_out = translate_text_base_nmt(input_text)
    finetuned_out = translate_text_nmt(input_text)
    return lstm_out, base_out, finetuned_out
76
+
77
# Create the Gradio interface: a single English text input and three text
# outputs (one per model) so the translations can be compared side by side.
# The three output boxes correspond positionally to the 3-tuple returned by
# translate_text (LSTM, base Helsinki, fine-tuned Helsinki).
iface = gr.Interface(
    fn=translate_text,
    inputs=gr.components.Textbox(lines=2, placeholder="Enter text to translate from English to Hindi..."),
    outputs=[
        gr.components.Textbox(label="Translation (LSTM Model)"),
        gr.components.Textbox(label="Translation (Base Helsinki Model)"),
        gr.components.Textbox(label="Translation (Fine-tuned Helsinki Model)")
    ],
    title="English to Hindi Translator",
    description="Enter English text and get the Hindi translation from three different models: LSTM, Base Helsinki-NLP, and Fine-tuned Helsinki-NLP."
)

# Launch the Gradio app