Update app.py
app.py
CHANGED
@@ -20,20 +20,20 @@ from functools import partial
 
 from transformers import AutoTokenizer, MarianTokenizer, AutoModel, AutoModelForSeq2SeqLM, MarianMTModel
 
-model_es = "Helsinki-NLP/opus-mt-
-
-
-model_ar = "Helsinki-NLP/opus-mt-
+model_es = "Helsinki-NLP/opus-mt-fr-es"
+model_en = "Helsinki-NLP/opus-mt-fr-en"
+model_de = "Helsinki-NLP/opus-mt-fr-de"
+# model_ar = "Helsinki-NLP/opus-mt-fr-ar"
 
 tokenizer_es = AutoTokenizer.from_pretrained(model_es)
-
-
-tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)
+tokenizer_en = AutoTokenizer.from_pretrained(model_en)
+tokenizer_de = AutoTokenizer.from_pretrained(model_de)
+# tokenizer_ar = AutoTokenizer.from_pretrained(model_ar)
 
 model_tr_es = MarianMTModel.from_pretrained(model_es)
-
-
-model_tr_ar = MarianMTModel.from_pretrained(model_ar)
+model_tr_en = MarianMTModel.from_pretrained(model_en)
+model_tr_de = MarianMTModel.from_pretrained(model_de)
+# model_tr_ar = MarianMTModel.from_pretrained(model_ar)
 
 from faiss import write_index, read_index
 import pickle
@@ -53,30 +53,30 @@ def load_index(model):
 
 dict_models = {
 'en-es': model_es,
-'en-
-'en-
-'en-ar': model_ar,
+'en-en': model_en,
+'en-de': model_de,
+# 'en-ar': model_ar,
 }
 
 dict_models_tr = {
 'en-es': model_tr_es,
-'en-
-'en-
-'en-ar': model_tr_ar,
+'en-en': model_tr_en,
+'en-de': model_tr_de,
+# 'en-ar': model_tr_ar,
 }
 
 dict_tokenizer_tr = {
 'en-es': tokenizer_es,
-'en-
-'en-
-'en-ar': tokenizer_ar,
+'en-en': tokenizer_en,
+'en-de': tokenizer_de,
+# 'en-ar': tokenizer_ar,
 }
 # dict_reference_faiss = {'en-es':[]}
 dict_reference_faiss = {
-'en-es': load_index('en-es'),
-'en-ar': load_index('en-ar'),
-'en-fr': load_index('en-fr'),
-'en-zh': load_index('en-zh'),
+'en-es': [], #load_index('en-es'),
+'en-ar': [], #load_index('en-ar'),
+'en-fr': [], #load_index('en-fr'),
+'en-zh': [], #load_index('en-zh'),
 }
 
 # print("dict", dict_reference_faiss['en-es']['input']['tokens'][1])
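For orientation, a minimal sketch of how one of the Marian checkpoint/tokenizer pairs registered above is typically driven with transformers; the example sentence, the variable names, and the choice of the fr-en checkpoint are illustrative, not taken from app.py:

```python
# Hedged sketch: translate one French sentence with a Marian pair (illustrative only).
from transformers import AutoTokenizer, MarianMTModel

model_name = "Helsinki-NLP/opus-mt-fr-en"   # one of the checkpoints added in this diff
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

batch = tokenizer(["Le chat dort sur le canapé."], return_tensors="pt", padding=True)
generated = model.generate(**batch)          # Marian configs default to beam search
print(tokenizer.batch_decode(generated, skip_special_tokens=True))
```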
@@ -714,16 +714,16 @@ html_embd_target= """
 """
 
 html_att_enc = """
-<div id="d3_att_enc">... Encoder self attention only
+<div id="d3_att_enc">... Encoder self attention only ... Always read from left to right</div>
 <div id="bertviz_enc"></div>
 """
 
 html_att_cross = """
-<div id="d3_att_cross">... Encoder-decoder cross attention only
+<div id="d3_att_cross">... Encoder-decoder cross attention only ...</div>
 """
 
 html_att_dec = """
-<div id="d3_att_dec">...
+<div id="d3_att_dec">... Decoder self attention only ...</div>
 """
 
 
@@ -747,17 +747,17 @@ def first_function(w1, model):
 all_sentences.append(params)
 # print(len(params))
 translated_text += params[0] + ' \n'
-input_embeddings.append({
-
-
-
-
-})
-output_embeddings.append({
-
-
-
-})
+# input_embeddings.append({
+# 'embeddings': params[1][6].detach(), ## create a vocabulary with the set of embeddings
+# 'tokens': params[1][3+2].tolist(), # one translation = one sentence
+# # 'texts' : dict_tokenizer_tr[model].decode(params[2].tolist())
+
+# })
+# output_embeddings.append({
+# 'embeddings' : params[1][7].detach(),
+# 'tokens': params[1][3+1].sequences.tolist(),
+# # 'texts' : dict_tokenizer_tr[model].decode(params[1].sequences.tolist())
+# })
 
 ## load_reference; ERROR
 ## Build FAISS index
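As a reference point for the FAISS step mentioned above, a hedged sketch of what indexing and querying token embeddings typically looks like; only write_index/read_index come from the import in this diff, while build_token_index, query_token_index, and the [n_tokens, d] float32 layout are assumptions for illustration:

```python
# Hedged sketch: exact L2 index over token embeddings (helper names are hypothetical).
import numpy as np
import faiss
from faiss import write_index, read_index

def build_token_index(embeddings: np.ndarray, path: str) -> faiss.Index:
    index = faiss.IndexFlatL2(embeddings.shape[1])  # flat index: exact search, no training step
    index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
    write_index(index, path)                        # persist so a later load_index() can read_index(path)
    return index

def query_token_index(path: str, queries: np.ndarray, k: int = 5):
    index = read_index(path)
    distances, ids = index.search(np.ascontiguousarray(queries, dtype=np.float32), k)
    return distances, ids                           # ids refer back to rows of the indexed embeddings
```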
@@ -771,21 +771,21 @@ def first_function(w1, model):
 # ---> preload faiss using the respective model with a initial dataset.
 
 ### to uncomment gg1 ###
-result_search = {}
-result_search['input'] = build_search(input_embeddings, model, type='input')
-result_search['output'] = build_search(output_embeddings, model, type='output')
-
-json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
-dict_projected = {}
-for type in ['input', 'output']:
-
-
-
-
-
-
-
-
+# result_search = {}
+# result_search['input'] = build_search(input_embeddings, model, type='input')
+# result_search['output'] = build_search(output_embeddings, model, type='output')
+
+# json_out = {'input': {'tokens': {}, 'words': {}}, 'output': {'tokens': {}, 'words': {}}}
+# dict_projected = {}
+# for type in ['input', 'output']:
+# dict_projected[type] = {}
+# for key in ['tokens', 'words']:
+# similar_key = result_search[type][key]['similar']
+# vocab = result_search[type][key]['vocab_queries']
+# dict_projected[type][key] = filtered_projection(similar_key, vocab, model, type=type, key=key)
+# json_out[type][key]['similar_queries'] = similar_key
+# json_out[type][key]['tnse'] = dict_projected[type][key]
+# json_out[type][key]['key_text_list'] = result_search[type][key]['sentence_key_list']
 ## to uncomment gg1 ###
 
 ## bertviz
@@ -797,11 +797,12 @@ def first_function(w1, model):
 html_att_cross = params[4][1]
 
 ### to uncomment gg1 ###
-params = [params[0], params[1], json_out, params[2][0], params[3][0], params[4][0]]
-### to uncomment gg1
+# params = [params[0], params[1], json_out, params[2][0], params[3][0], params[4][0]]
+### to uncomment gg1
+params = [params[0], params[1], [], params[2][0], params[3][0], params[4][0]]
+### to comment gg1 ###
 
-
-# params.append([tgt, params['params'], params['html2'].data]
+# params.append([tgt, params['params'], params['html2'].data]
 
 return [translated_text, params, html_att_enc, html_att_dec, html_att_cross]
 
@@ -815,7 +816,7 @@ def second_function(w1,j2):
 with gr.Blocks(js="plotsjs.js") as demo:
 gr.Markdown(
 """
-# MAKE NMT
+# MAKE NMT Viz \t `Literacy task`
 """)
 
 gr.Markdown(
@@ -827,7 +828,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
 """
 1. Select the language pair for the translation
 """)
-radio_c = gr.Radio(choices=['en
+radio_c = gr.Radio(choices=['fr-en', 'fr-es', 'fr-de'], value="fr-en", label= ['French to English', "French to Spanish", "French to German"], container=False)
 gr.Markdown(
 """
 2. Source text to translate
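Note that gr.Radio's label parameter takes a single string, so the list passed above is unlikely to render as per-option names. A hedged alternative, assuming a Gradio version whose Radio accepts (name, value) tuples in choices, pairs each display name with the short value used elsewhere in the app; this is a sketch, not the code in this commit:

```python
# Hedged sketch: readable option names while keeping 'fr-en' style values (illustrative).
import gradio as gr

radio_c = gr.Radio(
    choices=[("French to English", "fr-en"),
             ("French to Spanish", "fr-es"),
             ("French to German", "fr-de")],
    value="fr-en",
    label="Language pair",
    container=False,
)
```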
@@ -851,10 +852,10 @@ with gr.Blocks(js="plotsjs.js") as demo:
 with gr.Accordion("3. Review the source tokenization:", open=False):
 input_tokenisation = gr.HTML(html_tok)
 
-with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
-
+# with gr.Accordion("4. Review similar source tokens in the embedding space:", open=False):
+# input_embd= gr.HTML(html_embd)
 
-with gr.Accordion("
+with gr.Accordion("4. Review the attention between the source tokens:", open=False):
 gr.Markdown(
 """
 `Bertviz `
@@ -871,10 +872,10 @@ with gr.Blocks(js="plotsjs.js") as demo:
 with gr.Accordion("1. Review the target tokenization:", open=False):
 target_tokenisation = gr.HTML(html_tok_target)
 
-with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
-
+# with gr.Accordion("2. Review similar target tokens in the embedding space:", open=False):
+# target_embd= gr.HTML(html_embd_target)
 
-with gr.Accordion("
+with gr.Accordion("2. Review the attention between the target and source tokens:", open=False):
 gr.Markdown(
 """
 `Bertviz -cross attention`
@@ -882,7 +883,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
 input_embd= gr.HTML(html_att_cross)
 cross_html = gr.HTML()
 
-with gr.Accordion("
+with gr.Accordion("3. Review the attention between the target tokens:", open=False):
 gr.Markdown(
 """
 `Bertviz -dec attention`
@@ -890,7 +891,7 @@ with gr.Blocks(js="plotsjs.js") as demo:
 input_embd= gr.HTML(html_att_dec)
 dec_html = gr.HTML()
 
-with gr.Accordion("
+with gr.Accordion("4. Review the alternative translations tokens:", open=False):
 gr.Markdown(
 """
 Generation process : `topk - beam search `