Update app.py
app.py
CHANGED
@@ -20,20 +20,20 @@ from functools import partial
 
 from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel
 
-model_es = "Helsinki-NLP/opus-mt-
-
-
-model_ar = "Helsinki-NLP/opus-mt-
+model_es = "Helsinki-NLP/opus-mt-fr-es"
+model_en = "Helsinki-NLP/opus-mt-fr-en"
+model_de = "Helsinki-NLP/opus-mt-fr-de"
+# model_ar = "Helsinki-NLP/opus-mt-fr-ar"
 
 tokenizer_es = AutoTokenizer.from_pretrained(model_es)
-
-
-tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)
+tokenizer_en = AutoTokenizer.from_pretrained(model_en)
+tokenizer_de = AutoTokenizer.from_pretrained(model_de)
+# tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)
 
 model_tr_es = MarianMTModel.from_pretrained(model_es)
-
-
-model_tr_ar = MarianMTModel.from_pretrained(model_ar)
+model_tr_en = MarianMTModel.from_pretrained(model_en)
+model_tr_de = MarianMTModel.from_pretrained(model_de)
+# model_tr_ar = MarianMTModel.from_pretrained(model_ar)
 
 from faiss import write_index, read_index
 import pickle
@@ -53,30 +53,30 @@ def load_index(model):
 
 dict_models = {
 'en-es': model_es,
-'en-
-'en-
-'en-ar': model_ar,
+'en-en': model_en,
+'en-de': model_de,
+# 'en-ar': model_ar,
 }
 
 dict_models_tr = {
 'en-es': model_tr_es,
-'en-
-'en-
-'en-ar': model_tr_ar,
+'en-en': model_tr_en,
+'en-de': model_tr_de,
+# 'en-ar': model_tr_ar,
 }
 
 dict_tokenizer_tr = {
 'en-es': tokenizer_es,
-'en-
-'en-
-'en-ar': tokenizer_ar,
+'en-en': tokenizer_en,
+'en-de': tokenizer_de,
+# 'en-ar': tokenizer_ar,
 }
 # dict_reference_faiss = {'en-es':[]}
 dict_reference_faiss = {
-'en-es': load_index('en-es'),
-'en-ar': load_index('en-ar'),
-'en-fr': load_index('en-fr'),
-'en-zh': load_index('en-zh'),
+'en-es': [], #load_index('en-es'),
+'en-ar': [], #load_index('en-ar'),
+'en-fr': [], #load_index('en-fr'),
+'en-zh': [], #load_index('en-zh'),
 }
 
 # print("dict", dict_reference_faiss['en-es']['input']['tokens'][1])
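For orientation, a minimal sketch of how one of the Marian checkpoint/tokenizer pairs registered above is typically driven with transformers; the example sentence, the variable names, and the choice of the fr-en checkpoint are illustrative, not taken from app.py:

```python
# Hedged sketch: translate one French sentence with a Marian pair (illustrative only).
from transformers import AutoTokenizer, MarianMTModel

model_name = "Helsinki-NLP/opus-mt-fr-en"   # one of the checkpoints added in this diff
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

batch = tokenizer(["Le chat dort sur le canapé."], return_tensors="pt", padding=True)
generated = model.generate(**batch)          # Marian configs default to beam search
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```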
@@ -714,16 +714,16 @@ html_embd_target= """
 """
 
 html_att_enc = """
-<div id="d3_att_enc">... Encoder self attention only
+<div id="d3_att_enc">... Encoder self attention only ... Always read from left to right</div>
 <div id="bertviz_enc"></div>
 """
 
 html_att_cross = """
-<div id="d3_att_cross">... Encoder-decoder cross attention only
+<div id="d3_att_cross">... Encoder-decoder cross attention only ...</div>
 """
 
 html_att_dec = """
-<div id="d3_att_dec">...
+<div id="d3_att_dec">... Decoder self attention only ...</div>
 """
 
 
@@ -747,17 +747,17 @@ def first_function(w1, model):
 all_sentences.append(params)
 # print(len(params))
 translated_text += params[0] + ' \n'
-input_embeddings.append({
-
-
-
-
-})
-output_embeddings.append({
-
-
-
-})
+# input_embeddings.append({
+# 'embeddings': params[1][6].detach(), ## create a vocabulary with the set of embeddings
+# 'tokens': params[1][3+2].tolist(), # one translation = one sentence
+# # 'texts' : dict_tokenizer_tr[model].decode(params[2].tolist())
+
+# })
+# output_embeddings.append({
+# 'embeddings' : params[1][7].detach(),
+# 'tokens': params[1][3+1].sequences.tolist(),
+# # 'texts' : dict_tokenizer_tr[model].decode(params[1].sequences.tolist())
+# })
 
 ## load_reference; ERROR
 ## Build FAISS index
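As a reference point for the FAISS step mentioned above, a hedged sketch of what indexing and querying token embeddings typically looks like; only write_index/read_index come from the import in this diff, while build_token_index, query_token_index, and the [n_tokens, d] float32 layout are assumptions for illustration:

```python
# Hedged sketch: exact L2 index over token embeddings (helper names are hypothetical).
import numpy as np
import faiss
from faiss import write_index, read_index

def build_token_index(embeddings: np.ndarray, path: str) -> faiss.Index:
    index = faiss.IndexFlatL2(embeddings.shape[1])  # flat index: exact search, no training step
    index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
    write_index(index, path)                        # persist so a later load_index() can read_index(path)
    return index

def query_token_index(path: str, queries: np.ndarray, k: int = 5):
    index = read_index(path)
    distances, ids = index.search(np.ascontiguousarray(queries, dtype=np.float32), k)
    return distances, ids                           # ids refer back to rows of the indexed embeddings
```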
@@ -771,21 +771,21 @@ def first_function(w1, model):
 # ---> preload faiss using the respective model with a initial dataset.
 
 ### to uncomment gg1 ###
-result_search = {}
-result_search['input'] = build_search(input_embeddings, model, type='input')
-result_search['output'] = build_search(output_embeddings, model, type='output')
-
-json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
-dict_projected = {}
-for type in ['input', 'output']:
-
-
-
-
-
-
-
-
+# result_search = {}
+# result_search['input'] = build_search(input_embeddings, model, type='input')
+# result_search['output'] = build_search(output_embeddings, model, type='output')
+
+# json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
+# dict_projected = {}
+# for type in ['input', 'output']:
+# dict_projected[type] = {}
+# for key in ['tokens', 'words']:
+# similar_key = result_search[type][key]['similar']
+# vocab = result_search[type][key]['vocab_queries']
+# dict_projected[type][key] = filtered_projection(similar_key, vocab, model, type=type, key=key)
+# json_out[type][key]['similar_queries'] = similar_key
+# json_out[type][key]['tnse'] = dict_projected[type][key]
+# json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']
 ## to uncomment gg1 ###
 
 ## bertviz
@@ -797,11 +797,12 @@ def first_function(w1, model):
 html_att_cross = params[4][1]
 
 ### to uncomment gg1 ###
-params = [params[0], params[1], json_out, params[2][0], params[3][0], params[4][0]]
-### to uncomment gg1
+# params = [params[0], params[1], json_out, params[2][0], params[3][0], params[4][0]]
+### to uncomment gg1
+params = [params[0], params[1], [], params[2][0], params[3][0], params[4][0]]
+### to comment gg1 ###
 
-
-# params.append([tgt, params['params'], params['html2'].data]
+# params.append([tgt, params['params'], params['html2'].data]
 
 return [translated_text, params, html_att_enc, html_att_dec, html_att_cross]
 
@@ -815,7 +816,7 @@ def second_function(w1,j2):
 with gr.Blocks(js="plotsjs.js") as demo:
 gr.Markdown(
 """
-# MAKE NMT
+# MAKE NMT Viz \t `Literacy task`
 """)
 
 gr.Markdown(
@@ -827,7 +828,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
 """
 1. Select the language pair for the translation
 """)
-radio_c = gr.Radio(choices=['en
+radio_c = gr.Radio(choices=['fr-en', 'fr-es', 'fr-de'], value="fr-en", label= ['French to English', "French to Spanish", "French to German"], container=False)
 gr.Markdown(
 """
 2. Source text to translate
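Note that gr.Radio's label parameter takes a single string, so the list passed above is unlikely to render as per-option names. A hedged alternative, assuming a Gradio version whose Radio accepts (name, value) tuples in choices, pairs each display name with the short value used elsewhere in the app; this is a sketch, not the code in this commit:

```python
# Hedged sketch: readable option names while keeping 'fr-en' style values (illustrative).
import gradio as gr

radio_c = gr.Radio(
    choices=[("French to English", "fr-en"),
             ("French to Spanish", "fr-es"),
             ("French to German", "fr-de")],
    value="fr-en",
    label="Language pair",
    container=False,
)
```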
@@ -851,10 +852,10 @@ with gr.Blocks(js="plotsjs.js") as demo:
 with gr.Accordion("3. Review the source tokenization:", open=False):
 input_tokenisation = gr.HTML(html_tok)
 
-with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
-
+# with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
+# input_embd= gr.HTML(html_embd)
 
-with gr.Accordion("
+with gr.Accordion("4. Review the attention between the source tokens:", open=False):
 gr.Markdown(
 """
 `Bertviz `
@@ -871,10 +872,10 @@ with gr.Blocks(js="plotsjs.js") as demo:
 with gr.Accordion("1. Review the target tokenization:", open=False):
 target_tokenisation = gr.HTML(html_tok_target)
 
-with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
-
+# with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
+# target_embd= gr.HTML(html_embd_target)
 
-with gr.Accordion("
+with gr.Accordion("2. Review the attention between the target and source tokens:", open=False):
 gr.Markdown(
 """
 `Bertviz -cross attention`
@@ -882,7 +883,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
 input_embd= gr.HTML(html_att_cross)
 cross_html = gr.HTML()
 
-with gr.Accordion("
+with gr.Accordion("3. Review the attention between the target tokens:", open=False):
 gr.Markdown(
 """
 `Bertviz -dec attention`
@@ -890,7 +891,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
 input_embd= gr.HTML(html_att_dec)
 dec_html = gr.HTML()
 
-with gr.Accordion("
+with gr.Accordion("4. Review the alternative translations tokens:", open=False):
 gr.Markdown(
 """
 Generation process : `topk - beam search `