gabrielanicole committed
Commit 3044398 · verified · 1 Parent(s): cf24fff

Update app.py

Files changed (1):
  1. app.py +67 -66

app.py CHANGED
@@ -20,20 +20,20 @@ from functools import partial
 
  from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel
 
- model_es = "Helsinki-NLP/opus-mt-en-es"
- model_fr = "Helsinki-NLP/opus-mt-en-fr"
- model_zh = "Helsinki-NLP/opus-mt-en-zh"
- model_ar = "Helsinki-NLP/opus-mt-en-ar"
+ model_es = "Helsinki-NLP/opus-mt-fr-es"
+ model_en = "Helsinki-NLP/opus-mt-fr-en"
+ model_de = "Helsinki-NLP/opus-mt-fr-de"
+ # model_ar = "Helsinki-NLP/opus-mt-fr-ar"
 
  tokenizer_es = AutoTokenizer.from_pretrained(model_es)
- tokenizer_fr = AutoTokenizer.from_pretrained(model_fr)
- tokenizer_zh = AutoTokenizer.from_pretrained(model_zh)
- tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)
+ tokenizer_en = AutoTokenizer.from_pretrained(model_en)
+ tokenizer_de = AutoTokenizer.from_pretrained(model_de)
+ # tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)
 
  model_tr_es = MarianMTModel.from_pretrained(model_es)
- model_tr_fr = MarianMTModel.from_pretrained(model_fr)
- model_tr_zh = MarianMTModel.from_pretrained(model_zh)
- model_tr_ar = MarianMTModel.from_pretrained(model_ar)
+ model_tr_en = MarianMTModel.from_pretrained(model_en)
+ model_tr_de = MarianMTModel.from_pretrained(model_de)
+ # model_tr_ar = MarianMTModel.from_pretrained(model_ar)
 
  from faiss import write_index, read_index
  import pickle
@@ -53,30 +53,30 @@ def load_index(model):
 
  dict_models = {
      'en-es': model_es,
-     'en-fr': model_fr,
-     'en-zh': model_zh,
-     'en-ar': model_ar,
+     'en-en': model_en,
+     'en-de': model_de,
+     # 'en-ar': model_ar,
  }
 
  dict_models_tr = {
      'en-es': model_tr_es,
-     'en-fr': model_tr_fr,
-     'en-zh': model_tr_zh,
-     'en-ar': model_tr_ar,
+     'en-en': model_tr_en,
+     'en-de': model_tr_de,
+     # 'en-ar': model_tr_ar,
  }
 
  dict_tokenizer_tr = {
      'en-es': tokenizer_es,
-     'en-fr': tokenizer_fr,
-     'en-zh': tokenizer_zh,
-     'en-ar': tokenizer_ar,
+     'en-en': tokenizer_en,
+     'en-de': tokenizer_de,
+     # 'en-ar': tokenizer_ar,
  }
  # dict_reference_faiss = {'en-es':[]}
  dict_reference_faiss = {
-     'en-es': load_index('en-es'),
-     'en-ar': load_index('en-ar'),
-     'en-fr': load_index('en-fr'),
-     'en-zh': load_index('en-zh'),
+     'en-es': [], #load_index('en-es'),
+     'en-ar': [], #load_index('en-ar'),
+     'en-fr': [], #load_index('en-fr'),
+     'en-zh': [], #load_index('en-zh'),
  }
 
  # print("dict", dict_reference_faiss['en-es']['input']['tokens'][1])
@@ -714,16 +714,16 @@ html_embd_target= """
  """
 
  html_att_enc = """
- <div id="d3_att_enc">... Encoder self attention only -- last layer and mean across heads ... Always read from left to right</div>
+ <div id="d3_att_enc">... Encoder self attention only ... Always read from left to right</div>
  <div id="bertviz_enc"></div>
  """
 
  html_att_cross = """
- <div id="d3_att_cross">... Encoder-decoder cross attention only -- last layer and mean across heads ...</div>
+ <div id="d3_att_cross">... Encoder-decoder cross attention only ...</div>
  """
 
  html_att_dec = """
- <div id="d3_att_dec">... decoder self attention only -- last layer and mean across heads ...</div>
+ <div id="d3_att_dec">... Decoder self attention only ...</div>
  """
 
@@ -747,17 +747,17 @@ def first_function(w1, model):
      all_sentences.append(params)
      # print(len(params))
      translated_text += params[0] + ' \n'
-     input_embeddings.append({
-         'embeddings': params[1][6].detach(), ## create a vocabulary with the set of embeddings
-         'tokens': params[1][3+2].tolist(), # one translation = one sentence
-         # 'texts' : dict_tokenizer_tr[model].decode(params[2].tolist())
-
-     })
-     output_embeddings.append({
-         'embeddings' : params[1][7].detach(),
-         'tokens': params[1][3+1].sequences.tolist(),
-         # 'texts' : dict_tokenizer_tr[model].decode(params[1].sequences.tolist())
-     })
+     # input_embeddings.append({
+     #     'embeddings': params[1][6].detach(), ## create a vocabulary with the set of embeddings
+     #     'tokens': params[1][3+2].tolist(), # one translation = one sentence
+     #     # 'texts' : dict_tokenizer_tr[model].decode(params[2].tolist())
+
+     # })
+     # output_embeddings.append({
+     #     'embeddings' : params[1][7].detach(),
+     #     'tokens': params[1][3+1].sequences.tolist(),
+     #     # 'texts' : dict_tokenizer_tr[model].decode(params[1].sequences.tolist())
+     # })
 
      ## load_reference; ERROR
      ## Build FAISS index
@@ -771,21 +771,21 @@ def first_function(w1, model):
      # ---> preload faiss using the respective model with a initial dataset.
 
      ### to uncomment gg1 ###
-     result_search = {}
-     result_search['input'] = build_search(input_embeddings, model, type='input')
-     result_search['output'] = build_search(output_embeddings, model, type='output')
-
-     json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
-     dict_projected = {}
-     for type in ['input', 'output']:
-         dict_projected[type] = {}
-         for key in ['tokens', 'words']:
-             similar_key = result_search[type][key]['similar']
-             vocab = result_search[type][key]['vocab_queries']
-             dict_projected[type][key] = filtered_projection(similar_key, vocab, model, type=type, key=key)
-             json_out[type][key]['similar_queries'] = similar_key
-             json_out[type][key]['tnse'] = dict_projected[type][key]
-             json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']
+     # result_search = {}
+     # result_search['input'] = build_search(input_embeddings, model, type='input')
+     # result_search['output'] = build_search(output_embeddings, model, type='output')
+
+     # json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
+     # dict_projected = {}
+     # for type in ['input', 'output']:
+     #     dict_projected[type] = {}
+     #     for key in ['tokens', 'words']:
+     #         similar_key = result_search[type][key]['similar']
+     #         vocab = result_search[type][key]['vocab_queries']
+     #         dict_projected[type][key] = filtered_projection(similar_key, vocab, model, type=type, key=key)
+     #         json_out[type][key]['similar_queries'] = similar_key
+     #         json_out[type][key]['tnse'] = dict_projected[type][key]
+     #         json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']
      ## to uncomment gg1 ###
 
      ## bertviz
  ## bertviz
@@ -797,11 +797,12 @@ def first_function(w1, model):
797
  html_att_cross = params[4][1]
798
 
799
  ### to uncomment gg1 ###
800
- params = [params[0], params[1], json_out, params[2][0], params[3][0], params[4][0]]
801
- ### to uncomment gg1 ###
 
 
802
 
803
- # params = [params[0], params[1], [], params[2][0], params[3][0], params[4][0]]
804
- # params.append([tgt, params['params'], params['html2'].data]
805
 
806
  return [translated_text, params, html_att_enc, html_att_dec, html_att_cross]
807
 
@@ -815,7 +816,7 @@ def second_function(w1,j2):
  with gr.Blocks(js="plotsjs.js") as demo:
      gr.Markdown(
          """
-         # MAKE NMT Workshop \t `Literacy task`
+         # MAKE NMT Viz \t `Literacy task`
          """)
 
      gr.Markdown(
@@ -827,7 +828,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
          """
          1. Select the language pair for the translation
          """)
-     radio_c = gr.Radio(choices=['en-zh', 'en-es', 'en-fr', 'en-ar'], value="en-es", label= '', container=False)
+     radio_c = gr.Radio(choices=['fr-en', 'fr-es', 'fr-de'], value="fr-en", label= ['French to English', "French to Spanish", "French to German"], container=False)
      gr.Markdown(
          """
          2. Source text to translate
@@ -851,10 +852,10 @@ with gr.Blocks(js="plotsjs.js") as demo:
      with gr.Accordion("3. Review the source tokenization:", open=False):
          input_tokenisation = gr.HTML(html_tok)
 
-     with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
-         input_embd= gr.HTML(html_embd)
+     # with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
+     #     input_embd= gr.HTML(html_embd)
 
-     with gr.Accordion("5. Review the attention between the source tokens:", open=False):
+     with gr.Accordion("4. Review the attention between the source tokens:", open=False):
          gr.Markdown(
          """
          `Bertviz `
@@ -871,10 +872,10 @@ with gr.Blocks(js="plotsjs.js") as demo:
      with gr.Accordion("1. Review the target tokenization:", open=False):
          target_tokenisation = gr.HTML(html_tok_target)
 
-     with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
-         target_embd= gr.HTML(html_embd_target)
+     # with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
+     #     target_embd= gr.HTML(html_embd_target)
 
-     with gr.Accordion("3. Review the attention between the target and source tokens:", open=False):
+     with gr.Accordion("2. Review the attention between the target and source tokens:", open=False):
          gr.Markdown(
          """
          `Bertviz -cross attention`
@@ -882,7 +883,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
          input_embd= gr.HTML(html_att_cross)
          cross_html = gr.HTML()
 
-     with gr.Accordion("4. Review the attention between the target tokens:", open=False):
+     with gr.Accordion("3. Review the attention between the target tokens:", open=False):
          gr.Markdown(
          """
          `Bertviz -dec attention`
@@ -890,7 +891,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
          input_embd= gr.HTML(html_att_dec)
          dec_html = gr.HTML()
 
-     with gr.Accordion("6. Review the alternative translations tokens:", open=False):
+     with gr.Accordion("4. Review the alternative translations tokens:", open=False):
          gr.Markdown(
          """
          Generation process : `topk - beam search `
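
For context on the model swap above, here is a minimal sketch of how one of the newly referenced Marian checkpoints is typically used to translate a sentence with the `transformers` API. The helper name `translate_fr_en` and the example sentence are illustrative assumptions, not part of the commit.

```python
from transformers import AutoTokenizer, MarianMTModel

# French -> English checkpoint referenced by the updated app.py.
checkpoint = "Helsinki-NLP/opus-mt-fr-en"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = MarianMTModel.from_pretrained(checkpoint)

def translate_fr_en(text: str) -> str:
    # Tokenize the source sentence, generate a translation, and decode the best hypothesis.
    batch = tokenizer([text], return_tensors="pt")
    generated = model.generate(**batch)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

print(translate_fr_en("Le chat dort sur le canapé."))  # prints the English translation
```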
 
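The commit also stubs out the FAISS reference indexes: the `load_index(...)` calls in `dict_reference_faiss` are replaced with empty lists. As background on the `write_index`/`read_index` helpers imported in app.py, here is a minimal, self-contained sketch of persisting and reloading an index; the dimensions, data, and file name are illustrative assumptions, not the app's real setup.

```python
import numpy as np
import faiss
from faiss import write_index, read_index

# Illustrative setup: 1000 random 512-dimensional embeddings (not the app's real data).
dim = 512
embeddings = np.random.rand(1000, dim).astype("float32")

# Build a flat L2 index and add the vectors.
index = faiss.IndexFlatL2(dim)
index.add(embeddings)

# Persist the index to disk and read it back, as a load_index helper presumably would.
write_index(index, "reference.index")  # placeholder file name
index = read_index("reference.index")

# Retrieve the 5 nearest stored vectors for a single query embedding.
query = np.random.rand(1, dim).astype("float32")
distances, ids = index.search(query, 5)
```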