Spaces:
Runtime error
Runtime error
Hussain Shaikh
commited on
Commit
·
7edceed
1
Parent(s):
6325f49
final commit added required files
Browse files- .gitignore +143 -0
- api/api.py +152 -0
- api/punctuate.py +220 -0
- app.py +27 -4
- inference/__init__.py +0 -0
- inference/custom_interactive.py +298 -0
- inference/engine.py +198 -0
- legacy/apply_bpe_test_valid_notag.sh +33 -0
- legacy/apply_bpe_train_notag.sh +33 -0
- legacy/env.sh +17 -0
- legacy/indictrans_workflow.ipynb +643 -0
- legacy/install_fairseq.sh +45 -0
- legacy/run_inference.sh +80 -0
- legacy/run_joint_inference.sh +74 -0
- legacy/tpu_training_instructions.md +92 -0
- legacy/translate.sh +70 -0
- model_configs/__init__.py +1 -0
- model_configs/custom_transformer.py +38 -0
- requirements.txt +11 -0
- scripts/__init__.py +0 -0
- scripts/add_joint_tags_translate.py +61 -0
- scripts/add_tags_translate.py +33 -0
- scripts/clean_vocab.py +19 -0
- scripts/concat_joint_data.py +130 -0
- scripts/extract_non_english_pairs.py +108 -0
- scripts/postprocess_score.py +48 -0
- scripts/postprocess_translate.py +110 -0
- scripts/preprocess_translate.py +172 -0
- scripts/remove_large_sentences.py +44 -0
- scripts/remove_train_devtest_overlaps.py +265 -0
.gitignore
ADDED
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#ignore libs folder we use
|
2 |
+
indic_nlp_library
|
3 |
+
indic_nlp_resources
|
4 |
+
subword-nmt
|
5 |
+
|
6 |
+
# Byte-compiled / optimized / DLL files
|
7 |
+
__pycache__/
|
8 |
+
*.py[cod]
|
9 |
+
*$py.class
|
10 |
+
|
11 |
+
# C extensions
|
12 |
+
*.so
|
13 |
+
|
14 |
+
# Distribution / packaging
|
15 |
+
.Python
|
16 |
+
build/
|
17 |
+
develop-eggs/
|
18 |
+
dist/
|
19 |
+
downloads/
|
20 |
+
eggs/
|
21 |
+
.eggs/
|
22 |
+
lib/
|
23 |
+
lib64/
|
24 |
+
parts/
|
25 |
+
sdist/
|
26 |
+
var/
|
27 |
+
wheels/
|
28 |
+
share/python-wheels/
|
29 |
+
*.egg-info/
|
30 |
+
.installed.cfg
|
31 |
+
*.egg
|
32 |
+
MANIFEST
|
33 |
+
|
34 |
+
# PyInstaller
|
35 |
+
# Usually these files are written by a python script from a template
|
36 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
37 |
+
*.manifest
|
38 |
+
*.spec
|
39 |
+
|
40 |
+
# Installer logs
|
41 |
+
pip-log.txt
|
42 |
+
pip-delete-this-directory.txt
|
43 |
+
|
44 |
+
# Unit test / coverage reports
|
45 |
+
htmlcov/
|
46 |
+
.tox/
|
47 |
+
.nox/
|
48 |
+
.coverage
|
49 |
+
.coverage.*
|
50 |
+
.cache
|
51 |
+
nosetests.xml
|
52 |
+
coverage.xml
|
53 |
+
*.cover
|
54 |
+
*.py,cover
|
55 |
+
.hypothesis/
|
56 |
+
.pytest_cache/
|
57 |
+
cover/
|
58 |
+
|
59 |
+
# Translations
|
60 |
+
*.mo
|
61 |
+
*.pot
|
62 |
+
|
63 |
+
# Django stuff:
|
64 |
+
*.log
|
65 |
+
local_settings.py
|
66 |
+
db.sqlite3
|
67 |
+
db.sqlite3-journal
|
68 |
+
|
69 |
+
# Flask stuff:
|
70 |
+
instance/
|
71 |
+
.webassets-cache
|
72 |
+
|
73 |
+
# Scrapy stuff:
|
74 |
+
.scrapy
|
75 |
+
|
76 |
+
# Sphinx documentation
|
77 |
+
docs/_build/
|
78 |
+
|
79 |
+
# PyBuilder
|
80 |
+
.pybuilder/
|
81 |
+
target/
|
82 |
+
|
83 |
+
# Jupyter Notebook
|
84 |
+
.ipynb_checkpoints
|
85 |
+
|
86 |
+
# IPython
|
87 |
+
profile_default/
|
88 |
+
ipython_config.py
|
89 |
+
|
90 |
+
# pyenv
|
91 |
+
# For a library or package, you might want to ignore these files since the code is
|
92 |
+
# intended to run in multiple environments; otherwise, check them in:
|
93 |
+
# .python-version
|
94 |
+
|
95 |
+
# pipenv
|
96 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
97 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
98 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
99 |
+
# install all needed dependencies.
|
100 |
+
#Pipfile.lock
|
101 |
+
|
102 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
103 |
+
__pypackages__/
|
104 |
+
|
105 |
+
# Celery stuff
|
106 |
+
celerybeat-schedule
|
107 |
+
celerybeat.pid
|
108 |
+
|
109 |
+
# SageMath parsed files
|
110 |
+
*.sage.py
|
111 |
+
|
112 |
+
# Environments
|
113 |
+
.env
|
114 |
+
.venv
|
115 |
+
env/
|
116 |
+
venv/
|
117 |
+
ENV/
|
118 |
+
env.bak/
|
119 |
+
venv.bak/
|
120 |
+
|
121 |
+
# Spyder project settings
|
122 |
+
.spyderproject
|
123 |
+
.spyproject
|
124 |
+
|
125 |
+
# Rope project settings
|
126 |
+
.ropeproject
|
127 |
+
|
128 |
+
# mkdocs documentation
|
129 |
+
/site
|
130 |
+
|
131 |
+
# mypy
|
132 |
+
.mypy_cache/
|
133 |
+
.dmypy.json
|
134 |
+
dmypy.json
|
135 |
+
|
136 |
+
# Pyre type checker
|
137 |
+
.pyre/
|
138 |
+
|
139 |
+
# pytype static type analyzer
|
140 |
+
.pytype/
|
141 |
+
|
142 |
+
# Cython debug symbols
|
143 |
+
cython_debug/
|
api/api.py
ADDED
@@ -0,0 +1,152 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
|
3 |
+
import re
|
4 |
+
from math import floor, ceil
|
5 |
+
from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
|
6 |
+
# from nltk.tokenize import sent_tokenize
|
7 |
+
from flask import Flask, request, jsonify
|
8 |
+
from flask_cors import CORS, cross_origin
|
9 |
+
import webvtt
|
10 |
+
from io import StringIO
|
11 |
+
from mosestokenizer import MosesSentenceSplitter
|
12 |
+
|
13 |
+
from indicTrans.inference.engine import Model
|
14 |
+
from punctuate import RestorePuncts
|
15 |
+
from indicnlp.tokenize.sentence_tokenize import sentence_split
|
16 |
+
|
17 |
+
app = Flask(__name__)
|
18 |
+
cors = CORS(app)
|
19 |
+
app.config['CORS_HEADERS'] = 'Content-Type'
|
20 |
+
|
21 |
+
indic2en_model = Model(expdir='models/v3/indic-en')
|
22 |
+
en2indic_model = Model(expdir='models/v3/en-indic')
|
23 |
+
m2m_model = Model(expdir='models/m2m')
|
24 |
+
|
25 |
+
rpunct = RestorePuncts()
|
26 |
+
|
27 |
+
indic_language_dict = {
|
28 |
+
'Assamese': 'as',
|
29 |
+
'Hindi' : 'hi',
|
30 |
+
'Marathi' : 'mr',
|
31 |
+
'Tamil' : 'ta',
|
32 |
+
'Bengali' : 'bn',
|
33 |
+
'Kannada' : 'kn',
|
34 |
+
'Oriya' : 'or',
|
35 |
+
'Telugu' : 'te',
|
36 |
+
'Gujarati' : 'gu',
|
37 |
+
'Malayalam' : 'ml',
|
38 |
+
'Punjabi' : 'pa',
|
39 |
+
}
|
40 |
+
|
41 |
+
splitter = MosesSentenceSplitter('en')
|
42 |
+
|
43 |
+
def get_inference_params():
|
44 |
+
source_language = request.form['source_language']
|
45 |
+
target_language = request.form['target_language']
|
46 |
+
|
47 |
+
if source_language in indic_language_dict and target_language == 'English':
|
48 |
+
model = indic2en_model
|
49 |
+
source_lang = indic_language_dict[source_language]
|
50 |
+
target_lang = 'en'
|
51 |
+
elif source_language == 'English' and target_language in indic_language_dict:
|
52 |
+
model = en2indic_model
|
53 |
+
source_lang = 'en'
|
54 |
+
target_lang = indic_language_dict[target_language]
|
55 |
+
elif source_language in indic_language_dict and target_language in indic_language_dict:
|
56 |
+
model = m2m_model
|
57 |
+
source_lang = indic_language_dict[source_language]
|
58 |
+
target_lang = indic_language_dict[target_language]
|
59 |
+
|
60 |
+
return model, source_lang, target_lang
|
61 |
+
|
62 |
+
@app.route('/', methods=['GET'])
|
63 |
+
def main():
|
64 |
+
return "IndicTrans API"
|
65 |
+
|
66 |
+
@app.route('/supported_languages', methods=['GET'])
|
67 |
+
@cross_origin()
|
68 |
+
def supported_languages():
|
69 |
+
return jsonify(indic_language_dict)
|
70 |
+
|
71 |
+
@app.route("/translate", methods=['POST'])
|
72 |
+
@cross_origin()
|
73 |
+
def infer_indic_en():
|
74 |
+
model, source_lang, target_lang = get_inference_params()
|
75 |
+
source_text = request.form['text']
|
76 |
+
|
77 |
+
start_time = time.time()
|
78 |
+
target_text = model.translate_paragraph(source_text, source_lang, target_lang)
|
79 |
+
end_time = time.time()
|
80 |
+
return {'text':target_text, 'duration':round(end_time-start_time, 2)}
|
81 |
+
|
82 |
+
@app.route("/translate_vtt", methods=['POST'])
|
83 |
+
@cross_origin()
|
84 |
+
def infer_vtt_indic_en():
|
85 |
+
start_time = time.time()
|
86 |
+
model, source_lang, target_lang = get_inference_params()
|
87 |
+
source_text = request.form['text']
|
88 |
+
# vad_segments = request.form['vad_nochunk'] # Assuming it is an array of start & end timestamps
|
89 |
+
|
90 |
+
vad = webvtt.read_buffer(StringIO(source_text))
|
91 |
+
source_sentences = [v.text.replace('\r', '').replace('\n', ' ') for v in vad]
|
92 |
+
|
93 |
+
## SUMANTH LOGIC HERE ##
|
94 |
+
|
95 |
+
# for each vad timestamp, do:
|
96 |
+
large_sentence = ' '.join(source_sentences) # only sentences in that time range
|
97 |
+
large_sentence = large_sentence.lower()
|
98 |
+
# split_sents = sentence_split(large_sentence, 'en')
|
99 |
+
# print(split_sents)
|
100 |
+
|
101 |
+
large_sentence = re.sub(r'[^\w\s]', '', large_sentence)
|
102 |
+
punctuated = rpunct.punctuate(large_sentence, batch_size=32)
|
103 |
+
end_time = time.time()
|
104 |
+
print("Time Taken for punctuation: {} s".format(end_time - start_time))
|
105 |
+
start_time = time.time()
|
106 |
+
split_sents = splitter([punctuated]) ### Please uncomment
|
107 |
+
|
108 |
+
|
109 |
+
# print(split_sents)
|
110 |
+
# output_sentence_punctuated = model.translate_paragraph(punctuated, source_lang, target_lang)
|
111 |
+
output_sents = model.batch_translate(split_sents, source_lang, target_lang)
|
112 |
+
# print(output_sents)
|
113 |
+
# output_sents = split_sents
|
114 |
+
# print(output_sents)
|
115 |
+
# align this to those range of source_sentences in `captions`
|
116 |
+
|
117 |
+
map_ = {split_sents[i] : output_sents[i] for i in range(len(split_sents))}
|
118 |
+
# print(map_)
|
119 |
+
punct_para = ' '.join(list(map_.keys()))
|
120 |
+
nmt_para = ' '.join(list(map_.values()))
|
121 |
+
nmt_words = nmt_para.split(' ')
|
122 |
+
|
123 |
+
len_punct = len(punct_para.split(' '))
|
124 |
+
len_nmt = len(nmt_para.split(' '))
|
125 |
+
|
126 |
+
start = 0
|
127 |
+
for i in range(len(vad)):
|
128 |
+
if vad[i].text == '':
|
129 |
+
continue
|
130 |
+
|
131 |
+
len_caption = len(vad[i].text.split(' '))
|
132 |
+
frac = (len_caption / len_punct)
|
133 |
+
# frac = round(frac, 2)
|
134 |
+
|
135 |
+
req_nmt_size = floor(frac * len_nmt)
|
136 |
+
# print(frac, req_nmt_size)
|
137 |
+
|
138 |
+
vad[i].text = ' '.join(nmt_words[start:start+req_nmt_size])
|
139 |
+
# print(vad[i].text)
|
140 |
+
# print(start, req_nmt_size)
|
141 |
+
start += req_nmt_size
|
142 |
+
|
143 |
+
end_time = time.time()
|
144 |
+
|
145 |
+
print("Time Taken for translation: {} s".format(end_time - start_time))
|
146 |
+
|
147 |
+
# vad.save('aligned.vtt')
|
148 |
+
|
149 |
+
return {
|
150 |
+
'text': vad.content,
|
151 |
+
# 'duration':round(end_time-start_time, 2)
|
152 |
+
}
|
api/punctuate.py
ADDED
@@ -0,0 +1,220 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
# 💾⚙️🔮
|
3 |
+
|
4 |
+
# taken from https://github.com/Felflare/rpunct/blob/master/rpunct/punctuate.py
|
5 |
+
# modified to support batching during gpu inference
|
6 |
+
|
7 |
+
|
8 |
+
__author__ = "Daulet N."
|
9 |
+
__email__ = "[email protected]"
|
10 |
+
|
11 |
+
import time
|
12 |
+
import logging
|
13 |
+
import webvtt
|
14 |
+
import torch
|
15 |
+
from io import StringIO
|
16 |
+
from nltk.tokenize import sent_tokenize
|
17 |
+
#from langdetect import detect
|
18 |
+
from simpletransformers.ner import NERModel
|
19 |
+
|
20 |
+
|
21 |
+
class RestorePuncts:
|
22 |
+
def __init__(self, wrds_per_pred=250):
|
23 |
+
self.wrds_per_pred = wrds_per_pred
|
24 |
+
self.overlap_wrds = 30
|
25 |
+
self.valid_labels = ['OU', 'OO', '.O', '!O', ',O', '.U', '!U', ',U', ':O', ';O', ':U', "'O", '-O', '?O', '?U']
|
26 |
+
self.model = NERModel("bert", "felflare/bert-restore-punctuation", labels=self.valid_labels,
|
27 |
+
args={"silent": True, "max_seq_length": 512})
|
28 |
+
# use_cuda isnt working and this hack seems to load the model correctly to the gpu
|
29 |
+
self.model.device = torch.device("cuda:1")
|
30 |
+
# dummy punctuate to load the model onto gpu
|
31 |
+
self.punctuate("hello how are you")
|
32 |
+
|
33 |
+
def punctuate(self, text: str, batch_size:int=32, lang:str=''):
|
34 |
+
"""
|
35 |
+
Performs punctuation restoration on arbitrarily large text.
|
36 |
+
Detects if input is not English, if non-English was detected terminates predictions.
|
37 |
+
Overrride by supplying `lang='en'`
|
38 |
+
|
39 |
+
Args:
|
40 |
+
- text (str): Text to punctuate, can be few words to as large as you want.
|
41 |
+
- lang (str): Explicit language of input text.
|
42 |
+
"""
|
43 |
+
#if not lang and len(text) > 10:
|
44 |
+
# lang = detect(text)
|
45 |
+
#if lang != 'en':
|
46 |
+
# raise Exception(F"""Non English text detected. Restore Punctuation works only for English.
|
47 |
+
# If you are certain the input is English, pass argument lang='en' to this function.
|
48 |
+
# Punctuate received: {text}""")
|
49 |
+
|
50 |
+
def chunks(L, n):
|
51 |
+
return [L[x : x + n] for x in range(0, len(L), n)]
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
# plit up large text into bert digestable chunks
|
56 |
+
splits = self.split_on_toks(text, self.wrds_per_pred, self.overlap_wrds)
|
57 |
+
|
58 |
+
texts = [i["text"] for i in splits]
|
59 |
+
batches = chunks(texts, batch_size)
|
60 |
+
preds_lst = []
|
61 |
+
|
62 |
+
|
63 |
+
for batch in batches:
|
64 |
+
batch_preds, _ = self.model.predict(batch)
|
65 |
+
preds_lst.extend(batch_preds)
|
66 |
+
|
67 |
+
|
68 |
+
# predict slices
|
69 |
+
# full_preds_lst contains tuple of labels and logits
|
70 |
+
#full_preds_lst = [self.predict(i['text']) for i in splits]
|
71 |
+
# extract predictions, and discard logits
|
72 |
+
#preds_lst = [i[0][0] for i in full_preds_lst]
|
73 |
+
# join text slices
|
74 |
+
combined_preds = self.combine_results(text, preds_lst)
|
75 |
+
# create punctuated prediction
|
76 |
+
punct_text = self.punctuate_texts(combined_preds)
|
77 |
+
return punct_text
|
78 |
+
|
79 |
+
def predict(self, input_slice):
|
80 |
+
"""
|
81 |
+
Passes the unpunctuated text to the model for punctuation.
|
82 |
+
"""
|
83 |
+
predictions, raw_outputs = self.model.predict([input_slice])
|
84 |
+
return predictions, raw_outputs
|
85 |
+
|
86 |
+
@staticmethod
|
87 |
+
def split_on_toks(text, length, overlap):
|
88 |
+
"""
|
89 |
+
Splits text into predefined slices of overlapping text with indexes (offsets)
|
90 |
+
that tie-back to original text.
|
91 |
+
This is done to bypass 512 token limit on transformer models by sequentially
|
92 |
+
feeding chunks of < 512 toks.
|
93 |
+
Example output:
|
94 |
+
[{...}, {"text": "...", 'start_idx': 31354, 'end_idx': 32648}, {...}]
|
95 |
+
"""
|
96 |
+
wrds = text.replace('\n', ' ').split(" ")
|
97 |
+
resp = []
|
98 |
+
lst_chunk_idx = 0
|
99 |
+
i = 0
|
100 |
+
|
101 |
+
while True:
|
102 |
+
# words in the chunk and the overlapping portion
|
103 |
+
wrds_len = wrds[(length * i):(length * (i + 1))]
|
104 |
+
wrds_ovlp = wrds[(length * (i + 1)):((length * (i + 1)) + overlap)]
|
105 |
+
wrds_split = wrds_len + wrds_ovlp
|
106 |
+
|
107 |
+
# Break loop if no more words
|
108 |
+
if not wrds_split:
|
109 |
+
break
|
110 |
+
|
111 |
+
wrds_str = " ".join(wrds_split)
|
112 |
+
nxt_chunk_start_idx = len(" ".join(wrds_len))
|
113 |
+
lst_char_idx = len(" ".join(wrds_split))
|
114 |
+
|
115 |
+
resp_obj = {
|
116 |
+
"text": wrds_str,
|
117 |
+
"start_idx": lst_chunk_idx,
|
118 |
+
"end_idx": lst_char_idx + lst_chunk_idx,
|
119 |
+
}
|
120 |
+
|
121 |
+
resp.append(resp_obj)
|
122 |
+
lst_chunk_idx += nxt_chunk_start_idx + 1
|
123 |
+
i += 1
|
124 |
+
logging.info(f"Sliced transcript into {len(resp)} slices.")
|
125 |
+
return resp
|
126 |
+
|
127 |
+
@staticmethod
|
128 |
+
def combine_results(full_text: str, text_slices):
|
129 |
+
"""
|
130 |
+
Given a full text and predictions of each slice combines predictions into a single text again.
|
131 |
+
Performs validataion wether text was combined correctly
|
132 |
+
"""
|
133 |
+
split_full_text = full_text.replace('\n', ' ').split(" ")
|
134 |
+
split_full_text = [i for i in split_full_text if i]
|
135 |
+
split_full_text_len = len(split_full_text)
|
136 |
+
output_text = []
|
137 |
+
index = 0
|
138 |
+
|
139 |
+
if len(text_slices[-1]) <= 3 and len(text_slices) > 1:
|
140 |
+
text_slices = text_slices[:-1]
|
141 |
+
|
142 |
+
for _slice in text_slices:
|
143 |
+
slice_wrds = len(_slice)
|
144 |
+
for ix, wrd in enumerate(_slice):
|
145 |
+
# print(index, "|", str(list(wrd.keys())[0]), "|", split_full_text[index])
|
146 |
+
if index == split_full_text_len:
|
147 |
+
break
|
148 |
+
|
149 |
+
if split_full_text[index] == str(list(wrd.keys())[0]) and \
|
150 |
+
ix <= slice_wrds - 3 and text_slices[-1] != _slice:
|
151 |
+
index += 1
|
152 |
+
pred_item_tuple = list(wrd.items())[0]
|
153 |
+
output_text.append(pred_item_tuple)
|
154 |
+
elif split_full_text[index] == str(list(wrd.keys())[0]) and text_slices[-1] == _slice:
|
155 |
+
index += 1
|
156 |
+
pred_item_tuple = list(wrd.items())[0]
|
157 |
+
output_text.append(pred_item_tuple)
|
158 |
+
assert [i[0] for i in output_text] == split_full_text
|
159 |
+
return output_text
|
160 |
+
|
161 |
+
@staticmethod
|
162 |
+
def punctuate_texts(full_pred: list):
|
163 |
+
"""
|
164 |
+
Given a list of Predictions from the model, applies the predictions to text,
|
165 |
+
thus punctuating it.
|
166 |
+
"""
|
167 |
+
punct_resp = ""
|
168 |
+
for i in full_pred:
|
169 |
+
word, label = i
|
170 |
+
if label[-1] == "U":
|
171 |
+
punct_wrd = word.capitalize()
|
172 |
+
else:
|
173 |
+
punct_wrd = word
|
174 |
+
|
175 |
+
if label[0] != "O":
|
176 |
+
punct_wrd += label[0]
|
177 |
+
|
178 |
+
punct_resp += punct_wrd + " "
|
179 |
+
punct_resp = punct_resp.strip()
|
180 |
+
# Append trailing period if doesnt exist.
|
181 |
+
if punct_resp[-1].isalnum():
|
182 |
+
punct_resp += "."
|
183 |
+
return punct_resp
|
184 |
+
|
185 |
+
|
186 |
+
if __name__ == "__main__":
|
187 |
+
|
188 |
+
start = time.time()
|
189 |
+
punct_model = RestorePuncts()
|
190 |
+
|
191 |
+
load_model = time.time()
|
192 |
+
print(f'Time to load model: {load_model - start}')
|
193 |
+
# read test file
|
194 |
+
# with open('en_lower.txt', 'r') as fp:
|
195 |
+
# # test_sample = fp.read()
|
196 |
+
# lines = fp.readlines()
|
197 |
+
|
198 |
+
with open('sample.vtt', 'r') as fp:
|
199 |
+
source_text = fp.read()
|
200 |
+
|
201 |
+
# captions = webvtt.read_buffer(StringIO(source_text))
|
202 |
+
captions = webvtt.read('sample.vtt')
|
203 |
+
source_sentences = [caption.text.replace('\r', '').replace('\n', ' ') for caption in captions]
|
204 |
+
|
205 |
+
# print(source_sentences)
|
206 |
+
|
207 |
+
sent = ' '.join(source_sentences)
|
208 |
+
punctuated = punct_model.punctuate(sent)
|
209 |
+
|
210 |
+
tokenised = sent_tokenize(punctuated)
|
211 |
+
# print(tokenised)
|
212 |
+
|
213 |
+
for i in range(len(tokenised)):
|
214 |
+
captions[i].text = tokenised[i]
|
215 |
+
# return captions.content
|
216 |
+
captions.save('my_captions.vtt')
|
217 |
+
|
218 |
+
end = time.time()
|
219 |
+
print(f'Time for run: {end - load_model}')
|
220 |
+
print(f'Total time: {end - start}')
|
app.py
CHANGED
@@ -1,7 +1,30 @@
|
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
-
|
4 |
-
|
|
|
5 |
|
6 |
-
|
7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
import gradio as gr
|
3 |
|
4 |
+
download="wget --load-cookies /tmp/cookies.txt \"https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\\n/p')&id=1IpcnaQ2ScX_zodt2aLlXa_5Kkntl0nue\" -O en-indic.zip && rm -rf /tmp/cookies.txt"
|
5 |
+
os.system(download)
|
6 |
+
os.system('unzip /home/user/app/en-indic.zip')
|
7 |
|
8 |
+
from fairseq import checkpoint_utils, distributed_utils, options, tasks, utils
|
9 |
+
import gradio as gr
|
10 |
+
from inference.engine import Model
|
11 |
+
indic2en_model = Model(expdir='/home/user/app/en-indic')
|
12 |
+
|
13 |
+
INDIC = {"Assamese": "as", "Bengali": "bn", "Gujarati": "gu", "Hindi": "hi","Kannada": "kn","Malayalam": "ml", "Marathi": "mr", "Odia": "or","Punjabi": "pa","Tamil": "ta", "Telugu" : "te"}
|
14 |
+
|
15 |
+
|
16 |
+
def translate(text, lang):
|
17 |
+
return indic2en_model.translate_paragraph(text, 'en', INDIC[lang])
|
18 |
+
|
19 |
+
|
20 |
+
languages = list(INDIC.keys())
|
21 |
+
drop_down = gr.inputs.Dropdown(languages, type="value", default="Hindi", label="Select Target Language")
|
22 |
+
text = gr.inputs.Textbox(lines=5, placeholder="Enter Text to translate", default="", label="Enter Text in English")
|
23 |
+
text_ouptut = gr.outputs.Textbox(type="auto", label="Translated text in Target Language")
|
24 |
+
|
25 |
+
# example=[['I want to translate this sentence in Hindi','Hindi'],
|
26 |
+
# ['I am feeling very good today.', 'Bengali']]
|
27 |
+
|
28 |
+
supported_lang = ', '.join(languages)
|
29 |
+
iface = gr.Interface(fn=translate, inputs=[text,drop_down] , outputs=text_ouptut, title='IndicTrans NMT System', description = 'Currently the model supports ' + supported_lang, article = 'Original repository can be found [here](https://github.com/AI4Bharat/indicTrans)' , examples=None)
|
30 |
+
iface.launch(enable_queue=True)
|
inference/__init__.py
ADDED
File without changes
|
inference/custom_interactive.py
ADDED
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# python wrapper for fairseq-interactive command line tool
|
2 |
+
|
3 |
+
#!/usr/bin/env python3 -u
|
4 |
+
# Copyright (c) Facebook, Inc. and its affiliates.
|
5 |
+
#
|
6 |
+
# This source code is licensed under the MIT license found in the
|
7 |
+
# LICENSE file in the root directory of this source tree.
|
8 |
+
"""
|
9 |
+
Translate raw text with a trained model. Batches data on-the-fly.
|
10 |
+
"""
|
11 |
+
|
12 |
+
import ast
|
13 |
+
from collections import namedtuple
|
14 |
+
|
15 |
+
import torch
|
16 |
+
from fairseq import checkpoint_utils, options, tasks, utils
|
17 |
+
from fairseq.dataclass.utils import convert_namespace_to_omegaconf
|
18 |
+
from fairseq.token_generation_constraints import pack_constraints, unpack_constraints
|
19 |
+
from fairseq_cli.generate import get_symbols_to_strip_from_output
|
20 |
+
|
21 |
+
import codecs
|
22 |
+
|
23 |
+
|
24 |
+
Batch = namedtuple("Batch", "ids src_tokens src_lengths constraints")
|
25 |
+
Translation = namedtuple("Translation", "src_str hypos pos_scores alignments")
|
26 |
+
|
27 |
+
|
28 |
+
def make_batches(
|
29 |
+
lines, cfg, task, max_positions, encode_fn, constrainted_decoding=False
|
30 |
+
):
|
31 |
+
def encode_fn_target(x):
|
32 |
+
return encode_fn(x)
|
33 |
+
|
34 |
+
if constrainted_decoding:
|
35 |
+
# Strip (tab-delimited) contraints, if present, from input lines,
|
36 |
+
# store them in batch_constraints
|
37 |
+
batch_constraints = [list() for _ in lines]
|
38 |
+
for i, line in enumerate(lines):
|
39 |
+
if "\t" in line:
|
40 |
+
lines[i], *batch_constraints[i] = line.split("\t")
|
41 |
+
|
42 |
+
# Convert each List[str] to List[Tensor]
|
43 |
+
for i, constraint_list in enumerate(batch_constraints):
|
44 |
+
batch_constraints[i] = [
|
45 |
+
task.target_dictionary.encode_line(
|
46 |
+
encode_fn_target(constraint),
|
47 |
+
append_eos=False,
|
48 |
+
add_if_not_exist=False,
|
49 |
+
)
|
50 |
+
for constraint in constraint_list
|
51 |
+
]
|
52 |
+
|
53 |
+
if constrainted_decoding:
|
54 |
+
constraints_tensor = pack_constraints(batch_constraints)
|
55 |
+
else:
|
56 |
+
constraints_tensor = None
|
57 |
+
|
58 |
+
tokens, lengths = task.get_interactive_tokens_and_lengths(lines, encode_fn)
|
59 |
+
|
60 |
+
itr = task.get_batch_iterator(
|
61 |
+
dataset=task.build_dataset_for_inference(
|
62 |
+
tokens, lengths, constraints=constraints_tensor
|
63 |
+
),
|
64 |
+
max_tokens=cfg.dataset.max_tokens,
|
65 |
+
max_sentences=cfg.dataset.batch_size,
|
66 |
+
max_positions=max_positions,
|
67 |
+
ignore_invalid_inputs=cfg.dataset.skip_invalid_size_inputs_valid_test,
|
68 |
+
).next_epoch_itr(shuffle=False)
|
69 |
+
for batch in itr:
|
70 |
+
ids = batch["id"]
|
71 |
+
src_tokens = batch["net_input"]["src_tokens"]
|
72 |
+
src_lengths = batch["net_input"]["src_lengths"]
|
73 |
+
constraints = batch.get("constraints", None)
|
74 |
+
|
75 |
+
yield Batch(
|
76 |
+
ids=ids,
|
77 |
+
src_tokens=src_tokens,
|
78 |
+
src_lengths=src_lengths,
|
79 |
+
constraints=constraints,
|
80 |
+
)
|
81 |
+
|
82 |
+
|
83 |
+
class Translator:
|
84 |
+
def __init__(
|
85 |
+
self, data_dir, checkpoint_path, batch_size=25, constrained_decoding=False
|
86 |
+
):
|
87 |
+
|
88 |
+
self.constrained_decoding = constrained_decoding
|
89 |
+
self.parser = options.get_generation_parser(interactive=True)
|
90 |
+
# buffer_size is currently not used but we just initialize it to batch
|
91 |
+
# size + 1 to avoid any assertion errors.
|
92 |
+
if self.constrained_decoding:
|
93 |
+
self.parser.set_defaults(
|
94 |
+
path=checkpoint_path,
|
95 |
+
remove_bpe="subword_nmt",
|
96 |
+
num_workers=-1,
|
97 |
+
constraints="ordered",
|
98 |
+
batch_size=batch_size,
|
99 |
+
buffer_size=batch_size + 1,
|
100 |
+
)
|
101 |
+
else:
|
102 |
+
self.parser.set_defaults(
|
103 |
+
path=checkpoint_path,
|
104 |
+
remove_bpe="subword_nmt",
|
105 |
+
num_workers=-1,
|
106 |
+
batch_size=batch_size,
|
107 |
+
buffer_size=batch_size + 1,
|
108 |
+
)
|
109 |
+
args = options.parse_args_and_arch(self.parser, input_args=[data_dir])
|
110 |
+
# we are explictly setting src_lang and tgt_lang here
|
111 |
+
# generally the data_dir we pass contains {split}-{src_lang}-{tgt_lang}.*.idx files from
|
112 |
+
# which fairseq infers the src and tgt langs(if these are not passed). In deployment we dont
|
113 |
+
# use any idx files and only store the SRC and TGT dictionaries.
|
114 |
+
args.source_lang = "SRC"
|
115 |
+
args.target_lang = "TGT"
|
116 |
+
# since we are truncating sentences to max_seq_len in engine, we can set it to False here
|
117 |
+
args.skip_invalid_size_inputs_valid_test = False
|
118 |
+
|
119 |
+
# we have custom architechtures in this folder and we will let fairseq
|
120 |
+
# import this
|
121 |
+
args.user_dir = "model_configs"
|
122 |
+
self.cfg = convert_namespace_to_omegaconf(args)
|
123 |
+
|
124 |
+
utils.import_user_module(self.cfg.common)
|
125 |
+
|
126 |
+
if self.cfg.interactive.buffer_size < 1:
|
127 |
+
self.cfg.interactive.buffer_size = 1
|
128 |
+
if self.cfg.dataset.max_tokens is None and self.cfg.dataset.batch_size is None:
|
129 |
+
self.cfg.dataset.batch_size = 1
|
130 |
+
|
131 |
+
assert (
|
132 |
+
not self.cfg.generation.sampling
|
133 |
+
or self.cfg.generation.nbest == self.cfg.generation.beam
|
134 |
+
), "--sampling requires --nbest to be equal to --beam"
|
135 |
+
assert (
|
136 |
+
not self.cfg.dataset.batch_size
|
137 |
+
or self.cfg.dataset.batch_size <= self.cfg.interactive.buffer_size
|
138 |
+
), "--batch-size cannot be larger than --buffer-size"
|
139 |
+
|
140 |
+
# Fix seed for stochastic decoding
|
141 |
+
# if self.cfg.common.seed is not None and not self.cfg.generation.no_seed_provided:
|
142 |
+
# np.random.seed(self.cfg.common.seed)
|
143 |
+
# utils.set_torch_seed(self.cfg.common.seed)
|
144 |
+
|
145 |
+
# if not self.constrained_decoding:
|
146 |
+
# self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu
|
147 |
+
# else:
|
148 |
+
# self.use_cuda = False
|
149 |
+
|
150 |
+
self.use_cuda = torch.cuda.is_available() and not self.cfg.common.cpu
|
151 |
+
|
152 |
+
# Setup task, e.g., translation
|
153 |
+
self.task = tasks.setup_task(self.cfg.task)
|
154 |
+
|
155 |
+
# Load ensemble
|
156 |
+
overrides = ast.literal_eval(self.cfg.common_eval.model_overrides)
|
157 |
+
self.models, self._model_args = checkpoint_utils.load_model_ensemble(
|
158 |
+
utils.split_paths(self.cfg.common_eval.path),
|
159 |
+
arg_overrides=overrides,
|
160 |
+
task=self.task,
|
161 |
+
suffix=self.cfg.checkpoint.checkpoint_suffix,
|
162 |
+
strict=(self.cfg.checkpoint.checkpoint_shard_count == 1),
|
163 |
+
num_shards=self.cfg.checkpoint.checkpoint_shard_count,
|
164 |
+
)
|
165 |
+
|
166 |
+
# Set dictionaries
|
167 |
+
self.src_dict = self.task.source_dictionary
|
168 |
+
self.tgt_dict = self.task.target_dictionary
|
169 |
+
|
170 |
+
# Optimize ensemble for generation
|
171 |
+
for model in self.models:
|
172 |
+
if model is None:
|
173 |
+
continue
|
174 |
+
if self.cfg.common.fp16:
|
175 |
+
model.half()
|
176 |
+
if (
|
177 |
+
self.use_cuda
|
178 |
+
and not self.cfg.distributed_training.pipeline_model_parallel
|
179 |
+
):
|
180 |
+
model.cuda()
|
181 |
+
model.prepare_for_inference_(self.cfg)
|
182 |
+
|
183 |
+
# Initialize generator
|
184 |
+
self.generator = self.task.build_generator(self.models, self.cfg.generation)
|
185 |
+
|
186 |
+
# Handle tokenization and BPE
|
187 |
+
self.tokenizer = self.task.build_tokenizer(self.cfg.tokenizer)
|
188 |
+
self.bpe = self.task.build_bpe(self.cfg.bpe)
|
189 |
+
|
190 |
+
# Load alignment dictionary for unknown word replacement
|
191 |
+
# (None if no unknown word replacement, empty if no path to align dictionary)
|
192 |
+
self.align_dict = utils.load_align_dict(self.cfg.generation.replace_unk)
|
193 |
+
|
194 |
+
self.max_positions = utils.resolve_max_positions(
|
195 |
+
self.task.max_positions(), *[model.max_positions() for model in self.models]
|
196 |
+
)
|
197 |
+
|
198 |
+
def encode_fn(self, x):
|
199 |
+
if self.tokenizer is not None:
|
200 |
+
x = self.tokenizer.encode(x)
|
201 |
+
if self.bpe is not None:
|
202 |
+
x = self.bpe.encode(x)
|
203 |
+
return x
|
204 |
+
|
205 |
+
def decode_fn(self, x):
|
206 |
+
if self.bpe is not None:
|
207 |
+
x = self.bpe.decode(x)
|
208 |
+
if self.tokenizer is not None:
|
209 |
+
x = self.tokenizer.decode(x)
|
210 |
+
return x
|
211 |
+
|
212 |
+
def translate(self, inputs, constraints=None):
|
213 |
+
if self.constrained_decoding and constraints is None:
|
214 |
+
raise ValueError("Constraints cant be None in constrained decoding mode")
|
215 |
+
if not self.constrained_decoding and constraints is not None:
|
216 |
+
raise ValueError("Cannot pass constraints during normal translation")
|
217 |
+
if constraints:
|
218 |
+
constrained_decoding = True
|
219 |
+
modified_inputs = []
|
220 |
+
for _input, constraint in zip(inputs, constraints):
|
221 |
+
modified_inputs.append(_input + f"\t{constraint}")
|
222 |
+
inputs = modified_inputs
|
223 |
+
else:
|
224 |
+
constrained_decoding = False
|
225 |
+
|
226 |
+
start_id = 0
|
227 |
+
results = []
|
228 |
+
final_translations = []
|
229 |
+
for batch in make_batches(
|
230 |
+
inputs,
|
231 |
+
self.cfg,
|
232 |
+
self.task,
|
233 |
+
self.max_positions,
|
234 |
+
self.encode_fn,
|
235 |
+
constrained_decoding,
|
236 |
+
):
|
237 |
+
bsz = batch.src_tokens.size(0)
|
238 |
+
src_tokens = batch.src_tokens
|
239 |
+
src_lengths = batch.src_lengths
|
240 |
+
constraints = batch.constraints
|
241 |
+
if self.use_cuda:
|
242 |
+
src_tokens = src_tokens.cuda()
|
243 |
+
src_lengths = src_lengths.cuda()
|
244 |
+
if constraints is not None:
|
245 |
+
constraints = constraints.cuda()
|
246 |
+
|
247 |
+
sample = {
|
248 |
+
"net_input": {
|
249 |
+
"src_tokens": src_tokens,
|
250 |
+
"src_lengths": src_lengths,
|
251 |
+
},
|
252 |
+
}
|
253 |
+
|
254 |
+
translations = self.task.inference_step(
|
255 |
+
self.generator, self.models, sample, constraints=constraints
|
256 |
+
)
|
257 |
+
|
258 |
+
list_constraints = [[] for _ in range(bsz)]
|
259 |
+
if constrained_decoding:
|
260 |
+
list_constraints = [unpack_constraints(c) for c in constraints]
|
261 |
+
for i, (id, hypos) in enumerate(zip(batch.ids.tolist(), translations)):
|
262 |
+
src_tokens_i = utils.strip_pad(src_tokens[i], self.tgt_dict.pad())
|
263 |
+
constraints = list_constraints[i]
|
264 |
+
results.append(
|
265 |
+
(
|
266 |
+
start_id + id,
|
267 |
+
src_tokens_i,
|
268 |
+
hypos,
|
269 |
+
{
|
270 |
+
"constraints": constraints,
|
271 |
+
},
|
272 |
+
)
|
273 |
+
)
|
274 |
+
|
275 |
+
# sort output to match input order
|
276 |
+
for id_, src_tokens, hypos, _ in sorted(results, key=lambda x: x[0]):
|
277 |
+
src_str = ""
|
278 |
+
if self.src_dict is not None:
|
279 |
+
src_str = self.src_dict.string(
|
280 |
+
src_tokens, self.cfg.common_eval.post_process
|
281 |
+
)
|
282 |
+
|
283 |
+
# Process top predictions
|
284 |
+
for hypo in hypos[: min(len(hypos), self.cfg.generation.nbest)]:
|
285 |
+
hypo_tokens, hypo_str, alignment = utils.post_process_prediction(
|
286 |
+
hypo_tokens=hypo["tokens"].int().cpu(),
|
287 |
+
src_str=src_str,
|
288 |
+
alignment=hypo["alignment"],
|
289 |
+
align_dict=self.align_dict,
|
290 |
+
tgt_dict=self.tgt_dict,
|
291 |
+
remove_bpe="subword_nmt",
|
292 |
+
extra_symbols_to_ignore=get_symbols_to_strip_from_output(
|
293 |
+
self.generator
|
294 |
+
),
|
295 |
+
)
|
296 |
+
detok_hypo_str = self.decode_fn(hypo_str)
|
297 |
+
final_translations.append(detok_hypo_str)
|
298 |
+
return final_translations
|
inference/engine.py
ADDED
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from os import truncate
|
2 |
+
from sacremoses import MosesPunctNormalizer
|
3 |
+
from sacremoses import MosesTokenizer
|
4 |
+
from sacremoses import MosesDetokenizer
|
5 |
+
from subword_nmt.apply_bpe import BPE, read_vocabulary
|
6 |
+
import codecs
|
7 |
+
from tqdm import tqdm
|
8 |
+
from indicnlp.tokenize import indic_tokenize
|
9 |
+
from indicnlp.tokenize import indic_detokenize
|
10 |
+
from indicnlp.normalize import indic_normalize
|
11 |
+
from indicnlp.transliterate import unicode_transliterate
|
12 |
+
from mosestokenizer import MosesSentenceSplitter
|
13 |
+
from indicnlp.tokenize import sentence_tokenize
|
14 |
+
|
15 |
+
from inference.custom_interactive import Translator
|
16 |
+
|
17 |
+
|
18 |
+
INDIC = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
|
19 |
+
|
20 |
+
|
21 |
+
def split_sentences(paragraph, language):
|
22 |
+
if language == "en":
|
23 |
+
with MosesSentenceSplitter(language) as splitter:
|
24 |
+
return splitter([paragraph])
|
25 |
+
elif language in INDIC:
|
26 |
+
return sentence_tokenize.sentence_split(paragraph, lang=language)
|
27 |
+
|
28 |
+
|
29 |
+
def add_token(sent, tag_infos):
|
30 |
+
"""add special tokens specified by tag_infos to each element in list
|
31 |
+
|
32 |
+
tag_infos: list of tuples (tag_type,tag)
|
33 |
+
|
34 |
+
each tag_info results in a token of the form: __{tag_type}__{tag}__
|
35 |
+
|
36 |
+
"""
|
37 |
+
|
38 |
+
tokens = []
|
39 |
+
for tag_type, tag in tag_infos:
|
40 |
+
token = "__" + tag_type + "__" + tag + "__"
|
41 |
+
tokens.append(token)
|
42 |
+
|
43 |
+
return " ".join(tokens) + " " + sent
|
44 |
+
|
45 |
+
|
46 |
+
def apply_lang_tags(sents, src_lang, tgt_lang):
|
47 |
+
tagged_sents = []
|
48 |
+
for sent in sents:
|
49 |
+
tagged_sent = add_token(sent.strip(), [("src", src_lang), ("tgt", tgt_lang)])
|
50 |
+
tagged_sents.append(tagged_sent)
|
51 |
+
return tagged_sents
|
52 |
+
|
53 |
+
|
54 |
+
def truncate_long_sentences(sents):
|
55 |
+
|
56 |
+
MAX_SEQ_LEN = 200
|
57 |
+
new_sents = []
|
58 |
+
|
59 |
+
for sent in sents:
|
60 |
+
words = sent.split()
|
61 |
+
num_words = len(words)
|
62 |
+
if num_words > MAX_SEQ_LEN:
|
63 |
+
print_str = " ".join(words[:5]) + " .... " + " ".join(words[-5:])
|
64 |
+
sent = " ".join(words[:MAX_SEQ_LEN])
|
65 |
+
print(
|
66 |
+
f"WARNING: Sentence {print_str} truncated to 200 tokens as it exceeds maximum length limit"
|
67 |
+
)
|
68 |
+
|
69 |
+
new_sents.append(sent)
|
70 |
+
return new_sents
|
71 |
+
|
72 |
+
|
73 |
+
class Model:
|
74 |
+
def __init__(self, expdir):
|
75 |
+
self.expdir = expdir
|
76 |
+
self.en_tok = MosesTokenizer(lang="en")
|
77 |
+
self.en_normalizer = MosesPunctNormalizer()
|
78 |
+
self.en_detok = MosesDetokenizer(lang="en")
|
79 |
+
self.xliterator = unicode_transliterate.UnicodeIndicTransliterator()
|
80 |
+
print("Initializing vocab and bpe")
|
81 |
+
self.vocabulary = read_vocabulary(
|
82 |
+
codecs.open(f"{expdir}/vocab/vocab.SRC", encoding="utf-8"), 5
|
83 |
+
)
|
84 |
+
self.bpe = BPE(
|
85 |
+
codecs.open(f"{expdir}/vocab/bpe_codes.32k.SRC", encoding="utf-8"),
|
86 |
+
-1,
|
87 |
+
"@@",
|
88 |
+
self.vocabulary,
|
89 |
+
None,
|
90 |
+
)
|
91 |
+
|
92 |
+
print("Initializing model for translation")
|
93 |
+
# initialize the model
|
94 |
+
self.translator = Translator(
|
95 |
+
f"{expdir}/final_bin", f"{expdir}/model/checkpoint_best.pt", batch_size=100
|
96 |
+
)
|
97 |
+
|
98 |
+
# translate a batch of sentences from src_lang to tgt_lang
|
99 |
+
def batch_translate(self, batch, src_lang, tgt_lang):
|
100 |
+
|
101 |
+
assert isinstance(batch, list)
|
102 |
+
preprocessed_sents = self.preprocess(batch, lang=src_lang)
|
103 |
+
bpe_sents = self.apply_bpe(preprocessed_sents)
|
104 |
+
tagged_sents = apply_lang_tags(bpe_sents, src_lang, tgt_lang)
|
105 |
+
tagged_sents = truncate_long_sentences(tagged_sents)
|
106 |
+
|
107 |
+
translations = self.translator.translate(tagged_sents)
|
108 |
+
postprocessed_sents = self.postprocess(translations, tgt_lang)
|
109 |
+
|
110 |
+
return postprocessed_sents
|
111 |
+
|
112 |
+
# translate a paragraph from src_lang to tgt_lang
|
113 |
+
def translate_paragraph(self, paragraph, src_lang, tgt_lang):
|
114 |
+
|
115 |
+
assert isinstance(paragraph, str)
|
116 |
+
sents = split_sentences(paragraph, src_lang)
|
117 |
+
|
118 |
+
postprocessed_sents = self.batch_translate(sents, src_lang, tgt_lang)
|
119 |
+
|
120 |
+
translated_paragraph = " ".join(postprocessed_sents)
|
121 |
+
|
122 |
+
return translated_paragraph
|
123 |
+
|
124 |
+
def preprocess_sent(self, sent, normalizer, lang):
|
125 |
+
if lang == "en":
|
126 |
+
return " ".join(
|
127 |
+
self.en_tok.tokenize(
|
128 |
+
self.en_normalizer.normalize(sent.strip()), escape=False
|
129 |
+
)
|
130 |
+
)
|
131 |
+
else:
|
132 |
+
# line = indic_detokenize.trivial_detokenize(line.strip(), lang)
|
133 |
+
return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
|
134 |
+
" ".join(
|
135 |
+
indic_tokenize.trivial_tokenize(
|
136 |
+
normalizer.normalize(sent.strip()), lang
|
137 |
+
)
|
138 |
+
),
|
139 |
+
lang,
|
140 |
+
"hi",
|
141 |
+
).replace(" ् ", "्")
|
142 |
+
|
143 |
+
def preprocess(self, sents, lang):
|
144 |
+
"""
|
145 |
+
Normalize, tokenize and script convert(for Indic)
|
146 |
+
return number of sentences input file
|
147 |
+
|
148 |
+
"""
|
149 |
+
|
150 |
+
if lang == "en":
|
151 |
+
|
152 |
+
# processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
|
153 |
+
# delayed(preprocess_line)(line, None, lang) for line in tqdm(sents, total=num_lines)
|
154 |
+
# )
|
155 |
+
processed_sents = [
|
156 |
+
self.preprocess_sent(line, None, lang) for line in tqdm(sents)
|
157 |
+
]
|
158 |
+
|
159 |
+
else:
|
160 |
+
normfactory = indic_normalize.IndicNormalizerFactory()
|
161 |
+
normalizer = normfactory.get_normalizer(lang)
|
162 |
+
|
163 |
+
# processed_sents = Parallel(n_jobs=-1, backend="multiprocessing")(
|
164 |
+
# delayed(preprocess_line)(line, normalizer, lang) for line in tqdm(infile, total=num_lines)
|
165 |
+
# )
|
166 |
+
processed_sents = [
|
167 |
+
self.preprocess_sent(line, normalizer, lang) for line in tqdm(sents)
|
168 |
+
]
|
169 |
+
|
170 |
+
return processed_sents
|
171 |
+
|
172 |
+
def postprocess(self, sents, lang, common_lang="hi"):
|
173 |
+
"""
|
174 |
+
parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.
|
175 |
+
|
176 |
+
infname: fairseq log file
|
177 |
+
outfname: output file of translation (sentences not translated contain the dummy string 'DUMMY_OUTPUT'
|
178 |
+
input_size: expected number of output sentences
|
179 |
+
lang: language
|
180 |
+
"""
|
181 |
+
postprocessed_sents = []
|
182 |
+
|
183 |
+
if lang == "en":
|
184 |
+
for sent in sents:
|
185 |
+
# outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
|
186 |
+
postprocessed_sents.append(self.en_detok.detokenize(sent.split(" ")))
|
187 |
+
else:
|
188 |
+
for sent in sents:
|
189 |
+
outstr = indic_detokenize.trivial_detokenize(
|
190 |
+
self.xliterator.transliterate(sent, common_lang, lang), lang
|
191 |
+
)
|
192 |
+
# outfile.write(outstr + "\n")
|
193 |
+
postprocessed_sents.append(outstr)
|
194 |
+
return postprocessed_sents
|
195 |
+
|
196 |
+
def apply_bpe(self, sents):
|
197 |
+
|
198 |
+
return [self.bpe.process_line(sent) for sent in sents]
|
legacy/apply_bpe_test_valid_notag.sh
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
expdir=$1 # EXPDIR
|
4 |
+
org_data_dir=$2
|
5 |
+
langs=$3
|
6 |
+
|
7 |
+
#`dirname $0`/env.sh
|
8 |
+
SUBWORD_NMT_DIR="subword-nmt"
|
9 |
+
echo "Apply to each language"
|
10 |
+
|
11 |
+
for dset in `echo test dev`
|
12 |
+
do
|
13 |
+
echo $dset
|
14 |
+
|
15 |
+
in_dset_dir="$org_data_dir/$dset"
|
16 |
+
out_dset_dir="$expdir/bpe/$dset"
|
17 |
+
|
18 |
+
for lang in $langs
|
19 |
+
do
|
20 |
+
|
21 |
+
echo Apply BPE for $dset "-" $lang
|
22 |
+
|
23 |
+
mkdir -p $out_dset_dir
|
24 |
+
|
25 |
+
python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
|
26 |
+
-c $expdir/vocab/bpe_codes.32k.SRC_TGT \
|
27 |
+
--vocabulary $expdir/vocab/vocab.SRC \
|
28 |
+
--vocabulary-threshold 5 \
|
29 |
+
< $in_dset_dir/$dset.$lang \
|
30 |
+
> $out_dset_dir/$dset.$lang
|
31 |
+
|
32 |
+
done
|
33 |
+
done
|
legacy/apply_bpe_train_notag.sh
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/bin/bash
|
2 |
+
|
3 |
+
expdir=$1 # EXPDIR
|
4 |
+
|
5 |
+
#`dirname $0`/env.sh
|
6 |
+
SUBWORD_NMT_DIR="subword-nmt"
|
7 |
+
|
8 |
+
data_dir="$expdir/data"
|
9 |
+
train_file=$data_dir/train
|
10 |
+
bpe_file=$expdir/bpe/train/train
|
11 |
+
|
12 |
+
mkdir -p $expdir/bpe/train
|
13 |
+
|
14 |
+
echo "Apply to SRC corpus"
|
15 |
+
|
16 |
+
python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
|
17 |
+
-c $expdir/vocab/bpe_codes.32k.SRC_TGT \
|
18 |
+
--vocabulary $expdir/vocab/vocab.SRC \
|
19 |
+
--vocabulary-threshold 5 \
|
20 |
+
--num-workers "-1" \
|
21 |
+
< $train_file.SRC \
|
22 |
+
> $bpe_file.SRC
|
23 |
+
|
24 |
+
echo "Apply to TGT corpus"
|
25 |
+
|
26 |
+
python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
|
27 |
+
-c $expdir/vocab/bpe_codes.32k.SRC_TGT \
|
28 |
+
--vocabulary $expdir/vocab/vocab.TGT \
|
29 |
+
--vocabulary-threshold 5 \
|
30 |
+
--num-workers "-1" \
|
31 |
+
< $train_file.TGT \
|
32 |
+
> $bpe_file.TGT
|
33 |
+
|
legacy/env.sh
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
export SRC=''
|
3 |
+
|
4 |
+
## Python env directory where fairseq is installed
|
5 |
+
export PYTHON_ENV=''
|
6 |
+
|
7 |
+
export SUBWORD_NMT_DIR=''
|
8 |
+
export INDIC_RESOURCES_PATH=''
|
9 |
+
export INDIC_NLP_HOME=''
|
10 |
+
|
11 |
+
export CUDA_HOME=''
|
12 |
+
|
13 |
+
export PATH=$CUDA_HOME/bin:$INDIC_NLP_HOME:$PATH
|
14 |
+
export LD_LIBRARY_PATH=$CUDA_HOME/lib64
|
15 |
+
|
16 |
+
# set environment variable to control GPUS visible to the application
|
17 |
+
#export CUDA_VISIBLE_DEVICES="'
|
legacy/indictrans_workflow.ipynb
ADDED
@@ -0,0 +1,643 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import os\n",
|
10 |
+
"import random\n",
|
11 |
+
"from tqdm.notebook import tqdm\n",
|
12 |
+
"from sacremoses import MosesPunctNormalizer\n",
|
13 |
+
"from sacremoses import MosesTokenizer\n",
|
14 |
+
"from sacremoses import MosesDetokenizer\n",
|
15 |
+
"from collections import defaultdict\n",
|
16 |
+
"import sacrebleu"
|
17 |
+
]
|
18 |
+
},
|
19 |
+
{
|
20 |
+
"cell_type": "code",
|
21 |
+
"execution_count": null,
|
22 |
+
"metadata": {},
|
23 |
+
"outputs": [],
|
24 |
+
"source": [
|
25 |
+
"# The path to the local git repo for Indic NLP library\n",
|
26 |
+
"INDIC_NLP_LIB_HOME=\"\"\n",
|
27 |
+
"\n",
|
28 |
+
"# The path to the local git repo for Indic NLP Resources\n",
|
29 |
+
"INDIC_NLP_RESOURCES=\"\"\n",
|
30 |
+
"\n",
|
31 |
+
"import sys\n",
|
32 |
+
"sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))\n",
|
33 |
+
"\n",
|
34 |
+
"from indicnlp import common\n",
|
35 |
+
"common.set_resources_path(INDIC_NLP_RESOURCES)\n",
|
36 |
+
"\n",
|
37 |
+
"from indicnlp import loader\n",
|
38 |
+
"loader.load()"
|
39 |
+
]
|
40 |
+
},
|
41 |
+
{
|
42 |
+
"cell_type": "code",
|
43 |
+
"execution_count": null,
|
44 |
+
"metadata": {},
|
45 |
+
"outputs": [],
|
46 |
+
"source": [
|
47 |
+
"import indicnlp\n",
|
48 |
+
"from indicnlp.tokenize import indic_tokenize\n",
|
49 |
+
"from indicnlp.tokenize import indic_detokenize\n",
|
50 |
+
"from indicnlp.normalize import indic_normalize\n",
|
51 |
+
"from indicnlp.transliterate import unicode_transliterate"
|
52 |
+
]
|
53 |
+
},
|
54 |
+
{
|
55 |
+
"cell_type": "code",
|
56 |
+
"execution_count": null,
|
57 |
+
"metadata": {},
|
58 |
+
"outputs": [],
|
59 |
+
"source": [
|
60 |
+
"LANGS=[\n",
|
61 |
+
" \"bn\",\n",
|
62 |
+
" \"gu\",\n",
|
63 |
+
" \"hi\",\n",
|
64 |
+
" \"kn\",\n",
|
65 |
+
" \"ml\",\n",
|
66 |
+
" \"mr\",\n",
|
67 |
+
" \"or\",\n",
|
68 |
+
" \"pa\",\n",
|
69 |
+
" \"ta\",\n",
|
70 |
+
" \"te\", \n",
|
71 |
+
"]"
|
72 |
+
]
|
73 |
+
},
|
74 |
+
{
|
75 |
+
"cell_type": "code",
|
76 |
+
"execution_count": null,
|
77 |
+
"metadata": {},
|
78 |
+
"outputs": [],
|
79 |
+
"source": [
|
80 |
+
"def preprocess(infname,outfname,lang):\n",
|
81 |
+
" \"\"\"\n",
|
82 |
+
" Preparing each corpus file: \n",
|
83 |
+
" - Normalization\n",
|
84 |
+
" - Tokenization \n",
|
85 |
+
" - Script coversion to Devanagari for Indic scripts\n",
|
86 |
+
" \"\"\"\n",
|
87 |
+
" \n",
|
88 |
+
" ### reading \n",
|
89 |
+
" with open(infname,'r',encoding='utf-8') as infile, \\\n",
|
90 |
+
" open(outfname,'w',encoding='utf-8') as outfile:\n",
|
91 |
+
" \n",
|
92 |
+
" if lang=='en':\n",
|
93 |
+
" en_tok=MosesTokenizer(lang='en')\n",
|
94 |
+
" en_normalizer = MosesPunctNormalizer()\n",
|
95 |
+
" for line in tqdm(infile): \n",
|
96 |
+
" outline=' '.join(\n",
|
97 |
+
" en_tok.tokenize( \n",
|
98 |
+
" en_normalizer.normalize(line.strip()), \n",
|
99 |
+
" escape=False ) )\n",
|
100 |
+
" outfile.write(outline+'\\n')\n",
|
101 |
+
" \n",
|
102 |
+
" else:\n",
|
103 |
+
" normfactory=indic_normalize.IndicNormalizerFactory()\n",
|
104 |
+
" normalizer=normfactory.get_normalizer(lang)\n",
|
105 |
+
" for line in tqdm(infile): \n",
|
106 |
+
" outline=unicode_transliterate.UnicodeIndicTransliterator.transliterate(\n",
|
107 |
+
" ' '.join(\n",
|
108 |
+
" indic_tokenize.trivial_tokenize(\n",
|
109 |
+
" normalizer.normalize(line.strip()), lang) ), lang, 'hi').replace(' ् ','्')\n",
|
110 |
+
"\n",
|
111 |
+
"\n",
|
112 |
+
" outfile.write(outline+'\\n')"
|
113 |
+
]
|
114 |
+
},
|
115 |
+
{
|
116 |
+
"cell_type": "code",
|
117 |
+
"execution_count": null,
|
118 |
+
"metadata": {},
|
119 |
+
"outputs": [],
|
120 |
+
"source": [
|
121 |
+
"def add_token(sent, tag_infos):\n",
|
122 |
+
" \"\"\" add special tokens specified by tag_infos to each element in list\n",
|
123 |
+
"\n",
|
124 |
+
" tag_infos: list of tuples (tag_type,tag)\n",
|
125 |
+
"\n",
|
126 |
+
" each tag_info results in a token of the form: __{tag_type}__{tag}__\n",
|
127 |
+
"\n",
|
128 |
+
" \"\"\"\n",
|
129 |
+
"\n",
|
130 |
+
" tokens=[]\n",
|
131 |
+
" for tag_type, tag in tag_infos:\n",
|
132 |
+
" token = '__' + tag_type + '__' + tag + '__'\n",
|
133 |
+
" tokens.append(token)\n",
|
134 |
+
"\n",
|
135 |
+
" return ' '.join(tokens) + ' ' + sent \n",
|
136 |
+
"\n",
|
137 |
+
"\n",
|
138 |
+
"def concat_data(data_dir, outdir, lang_pair_list, out_src_lang='SRC', out_trg_lang='TGT'):\n",
|
139 |
+
" \"\"\"\n",
|
140 |
+
" data_dir: input dir, contains directories for language pairs named l1-l2\n",
|
141 |
+
" \"\"\"\n",
|
142 |
+
" os.makedirs(outdir,exist_ok=True)\n",
|
143 |
+
"\n",
|
144 |
+
" out_src_fname='{}/train.{}'.format(outdir,out_src_lang)\n",
|
145 |
+
" out_trg_fname='{}/train.{}'.format(outdir,out_trg_lang)\n",
|
146 |
+
"# out_meta_fname='{}/metadata.txt'.format(outdir)\n",
|
147 |
+
"\n",
|
148 |
+
" print()\n",
|
149 |
+
" print(out_src_fname)\n",
|
150 |
+
" print(out_trg_fname)\n",
|
151 |
+
"# print(out_meta_fname)\n",
|
152 |
+
"\n",
|
153 |
+
" ### concatenate train data \n",
|
154 |
+
" if os.path.isfile(out_src_fname):\n",
|
155 |
+
" os.unlink(out_src_fname)\n",
|
156 |
+
" if os.path.isfile(out_trg_fname):\n",
|
157 |
+
" os.unlink(out_trg_fname)\n",
|
158 |
+
"# if os.path.isfile(out_meta_fname):\n",
|
159 |
+
"# os.unlink(out_meta_fname)\n",
|
160 |
+
"\n",
|
161 |
+
" for src_lang, trg_lang in tqdm(lang_pair_list):\n",
|
162 |
+
" print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n",
|
163 |
+
"\n",
|
164 |
+
" in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n",
|
165 |
+
" in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n",
|
166 |
+
"\n",
|
167 |
+
" print(in_src_fname)\n",
|
168 |
+
" os.system('cat {} >> {}'.format(in_src_fname,out_src_fname))\n",
|
169 |
+
"\n",
|
170 |
+
" print(in_trg_fname)\n",
|
171 |
+
" os.system('cat {} >> {}'.format(in_trg_fname,out_trg_fname)) \n",
|
172 |
+
" \n",
|
173 |
+
" \n",
|
174 |
+
"# with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n",
|
175 |
+
"# lpfile.write('\\n'.join( [ '-'.join(x) for x in lang_pair_list ] ))\n",
|
176 |
+
" \n",
|
177 |
+
" corpus_stats(data_dir, outdir, lang_pair_list)\n",
|
178 |
+
" \n",
|
179 |
+
"def corpus_stats(data_dir, outdir, lang_pair_list):\n",
|
180 |
+
" \"\"\"\n",
|
181 |
+
" data_dir: input dir, contains directories for language pairs named l1-l2\n",
|
182 |
+
" \"\"\"\n",
|
183 |
+
"\n",
|
184 |
+
" with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile: \n",
|
185 |
+
"\n",
|
186 |
+
" for src_lang, trg_lang in tqdm(lang_pair_list):\n",
|
187 |
+
" print('src: {}, tgt:{}'.format(src_lang,trg_lang)) \n",
|
188 |
+
"\n",
|
189 |
+
" in_src_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,src_lang)\n",
|
190 |
+
" # in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)\n",
|
191 |
+
"\n",
|
192 |
+
" print(in_src_fname)\n",
|
193 |
+
" corpus_size=0\n",
|
194 |
+
" with open(in_src_fname,'r',encoding='utf-8') as infile:\n",
|
195 |
+
" corpus_size=sum(map(lambda x:1,infile))\n",
|
196 |
+
" \n",
|
197 |
+
" lpfile.write('{}\\t{}\\t{}\\n'.format(src_lang,trg_lang,corpus_size))\n",
|
198 |
+
" \n",
|
199 |
+
"def generate_lang_tag_iterator(infname):\n",
|
200 |
+
" with open(infname,'r',encoding='utf-8') as infile:\n",
|
201 |
+
" for line in infile:\n",
|
202 |
+
" src,tgt,count=line.strip().split('\\t')\n",
|
203 |
+
" count=int(count)\n",
|
204 |
+
" for _ in range(count):\n",
|
205 |
+
" yield (src,tgt) "
|
206 |
+
]
|
207 |
+
},
|
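As a quick illustration of the helpers defined in the cell above (a minimal sketch, assuming the tag format and the tab-separated pair-count file written by `corpus_stats`):

```python
# add_token prepends one __{tag_type}__{tag}__ token per (tag_type, tag) pair.
print(add_token("this is a test", [("src", "hi"), ("tgt", "en")]))
# -> '__src__hi__ __tgt__en__ this is a test'

# generate_lang_tag_iterator replays each (src, tgt) pair once per sentence,
# driven by the "src\ttgt\tcount" lines that corpus_stats writes, e.g.
#   hi   en   2
#   ta   en   1
# yields ('hi', 'en'), ('hi', 'en'), ('ta', 'en') -- matching concatenation order.
```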
208 |
+
{
|
209 |
+
"cell_type": "code",
|
210 |
+
"execution_count": null,
|
211 |
+
"metadata": {},
|
212 |
+
"outputs": [],
|
213 |
+
"source": [
|
214 |
+
"#### directory containing all experiments \n",
|
215 |
+
"## one directory per experiment \n",
|
216 |
+
"EXPBASEDIR=''\n",
|
217 |
+
"\n",
|
218 |
+
"### directory containing data\n",
|
219 |
+
"## contains 3 directories: train test dev\n",
|
220 |
+
"## train directory structure: \n",
|
221 |
+
"## - There is one directory for each language pair\n",
|
222 |
+
"## - Directory naming convention lang1-lang2 (you need another directory/softlink for lang2-lang1)\n",
|
223 |
+
"## - Each directory contains 6 files: {train,test,dev}.{lang1,lang2}\n",
|
224 |
+
"## test & dev directory structure \n",
|
225 |
+
"## - test: contains files {test.l1,test.l2,test.l3} - assumes parallel test files like the wat2021 dataset\n",
|
226 |
+
"## - valid: contains files {dev.l1,dev.l2,dev.l3} - assumes parallel test files like the wat2021 dataset\n",
|
227 |
+
"## All files are tokenized\n",
|
228 |
+
"ORG_DATA_DIR='{d}/consolidated_unique_preprocessed'.format(d=BASEDIR)\n",
|
229 |
+
"\n"
|
230 |
+
]
|
231 |
+
},
|
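For concreteness, a hypothetical `ORG_DATA_DIR` layout matching the comments above (only `hi` shown; all files already tokenized):

```
consolidated_unique_preprocessed/
├── train/
│   └── en-hi/                  # one directory per language pair (add hi-en as a softlink if needed)
│       ├── train.en  train.hi
│       ├── dev.en    dev.hi
│       └── test.en   test.hi
├── dev/
│   └── dev.en  dev.hi  ...     # parallel dev files across languages
└── test/
    └── test.en  test.hi  ...   # parallel test files across languages
```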
232 |
+
{
|
233 |
+
"cell_type": "markdown",
|
234 |
+
"metadata": {},
|
235 |
+
"source": [
|
236 |
+
"# Exp2 (M2O)\n",
|
237 |
+
"\n",
|
238 |
+
"- All *-en "
|
239 |
+
]
|
240 |
+
},
|
241 |
+
{
|
242 |
+
"cell_type": "markdown",
|
243 |
+
"metadata": {},
|
244 |
+
"source": [
|
245 |
+
"**Params**"
|
246 |
+
]
|
247 |
+
},
|
248 |
+
{
|
249 |
+
"cell_type": "code",
|
250 |
+
"execution_count": null,
|
251 |
+
"metadata": {},
|
252 |
+
"outputs": [],
|
253 |
+
"source": [
|
254 |
+
"expname='exp2_m2o_baseline'\n",
|
255 |
+
"expdir='{}/{}'.format(EXPBASEDIR,expname)\n",
|
256 |
+
"\n",
|
257 |
+
"lang_pair_list=[]\n",
|
258 |
+
"for lang in LANGS: \n",
|
259 |
+
" lang_pair_list.append([lang,'en'])"
|
260 |
+
]
|
261 |
+
},
|
262 |
+
{
|
263 |
+
"cell_type": "markdown",
|
264 |
+
"metadata": {},
|
265 |
+
"source": [
|
266 |
+
"**Create Train Corpus**"
|
267 |
+
]
|
268 |
+
},
|
269 |
+
{
|
270 |
+
"cell_type": "code",
|
271 |
+
"execution_count": null,
|
272 |
+
"metadata": {},
|
273 |
+
"outputs": [],
|
274 |
+
"source": [
|
275 |
+
"indir='{}/train'.format(ORG_DATA_DIR)\n",
|
276 |
+
"outdir='{}/data'.format(expdir)\n",
|
277 |
+
"\n",
|
278 |
+
"# print(lang_pair_list)\n",
|
279 |
+
"concat_data(indir,outdir,lang_pair_list)"
|
280 |
+
]
|
281 |
+
},
|
282 |
+
{
|
283 |
+
"cell_type": "markdown",
|
284 |
+
"metadata": {},
|
285 |
+
"source": [
|
286 |
+
"**Learn BPE**"
|
287 |
+
]
|
288 |
+
},
|
289 |
+
{
|
290 |
+
"cell_type": "code",
|
291 |
+
"execution_count": null,
|
292 |
+
"metadata": {},
|
293 |
+
"outputs": [],
|
294 |
+
"source": [
|
295 |
+
"!echo ./learn_bpe.sh {expdir}"
|
296 |
+
]
|
297 |
+
},
|
298 |
+
{
|
299 |
+
"cell_type": "code",
|
300 |
+
"execution_count": null,
|
301 |
+
"metadata": {},
|
302 |
+
"outputs": [],
|
303 |
+
"source": [
|
304 |
+
"!echo ./apply_bpe_train_notag.sh {expdir}"
|
305 |
+
]
|
306 |
+
},
|
307 |
+
{
|
308 |
+
"cell_type": "code",
|
309 |
+
"execution_count": null,
|
310 |
+
"metadata": {},
|
311 |
+
"outputs": [],
|
312 |
+
"source": [
|
313 |
+
"!echo ./apply_bpe_test_valid_notag.sh {expdir} {ORG_DATA_DIR} {'\"'+' '.join(LANGS+['en'])+'\"'}"
|
314 |
+
]
|
315 |
+
},
|
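The `apply_bpe_*` shell scripts are only echoed here; as a rough sketch of what they do to each line, using subword-nmt's Python API directly (the codes path is hypothetical and depends on where `learn_bpe.sh` wrote it):

```python
import codecs
from subword_nmt.apply_bpe import BPE

# joint BPE codes learned by learn_bpe.sh (hypothetical path under {expdir}/vocab)
with codecs.open("exp2_m2o_baseline/vocab/bpe_codes.32k.SRC_TGT", encoding="utf-8") as codes:
    bpe = BPE(codes)

# rare words are split into subwords joined by the '@@' continuation marker
print(bpe.process_line("this is tokenized input text"))
```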
316 |
+
{
|
317 |
+
"cell_type": "markdown",
|
318 |
+
"metadata": {},
|
319 |
+
"source": [
|
320 |
+
"**Add language tags to train**"
|
321 |
+
]
|
322 |
+
},
|
323 |
+
{
|
324 |
+
"cell_type": "code",
|
325 |
+
"execution_count": null,
|
326 |
+
"metadata": {},
|
327 |
+
"outputs": [],
|
328 |
+
"source": [
|
329 |
+
"dset='train' \n",
|
330 |
+
"\n",
|
331 |
+
"src_fname='{expdir}/bpe/train/{dset}.SRC'.format(expdir=expdir,dset=dset)\n",
|
332 |
+
"tgt_fname='{expdir}/bpe/train/{dset}.TGT'.format(expdir=expdir,dset=dset)\n",
|
333 |
+
"meta_fname='{expdir}/data/lang_pairs.txt'.format(expdir=expdir,dset=dset)\n",
|
334 |
+
" \n",
|
335 |
+
"out_src_fname='{expdir}/final/{dset}.SRC'.format(expdir=expdir,dset=dset)\n",
|
336 |
+
"out_tgt_fname='{expdir}/final/{dset}.TGT'.format(expdir=expdir,dset=dset)\n",
|
337 |
+
"\n",
|
338 |
+
"lang_tag_iterator=generate_lang_tag_iterator(meta_fname)\n",
|
339 |
+
"\n",
|
340 |
+
"print(expdir)\n",
|
341 |
+
"os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
|
342 |
+
"\n",
|
343 |
+
"with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
|
344 |
+
" open(tgt_fname,'r',encoding='utf-8') as tgtfile, \\\n",
|
345 |
+
" open(out_src_fname,'w',encoding='utf-8') as outsrcfile, \\\n",
|
346 |
+
" open(out_tgt_fname,'w',encoding='utf-8') as outtgtfile: \n",
|
347 |
+
"\n",
|
348 |
+
" for (l1,l2), src_sent, tgt_sent in tqdm(zip(lang_tag_iterator, srcfile, tgtfile)):\n",
|
349 |
+
" outsrcfile.write(add_token(src_sent.strip(),[('src',l1),('tgt',l2)]) + '\\n' )\n",
|
350 |
+
" outtgtfile.write(tgt_sent.strip()+'\\n')"
|
351 |
+
]
|
352 |
+
},
|
353 |
+
{
|
354 |
+
"cell_type": "markdown",
|
355 |
+
"metadata": {},
|
356 |
+
"source": [
|
357 |
+
"**Add language tags to valid**\n",
|
358 |
+
"\n",
|
359 |
+
"- add language tags, create parallel corpus\n",
|
360 |
+
"- sample 20\\% for validation set \n",
|
361 |
+
"- Create final validation set"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": null,
|
367 |
+
"metadata": {},
|
368 |
+
"outputs": [],
|
369 |
+
"source": [
|
370 |
+
"dset='dev' \n",
|
371 |
+
"out_src_fname='{expdir}/final/{dset}.SRC'.format(\n",
|
372 |
+
" expdir=expdir,dset=dset)\n",
|
373 |
+
"out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n",
|
374 |
+
" expdir=expdir,dset=dset)\n",
|
375 |
+
"\n",
|
376 |
+
"os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
|
377 |
+
"\n",
|
378 |
+
"print('Processing validation files') \n",
|
379 |
+
"consolidated_dset=[]\n",
|
380 |
+
"for l1, l2 in tqdm(lang_pair_list):\n",
|
381 |
+
" src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
|
382 |
+
" expdir=expdir,dset=dset,lang=l1)\n",
|
383 |
+
" tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
|
384 |
+
" expdir=expdir,dset=dset,lang=l2)\n",
|
385 |
+
"# print(src_fname)\n",
|
386 |
+
"# print(os.path.exists(src_fname))\n",
|
387 |
+
" with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
|
388 |
+
" open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n",
|
389 |
+
" for src_sent, tgt_sent in zip(srcfile,tgtfile):\n",
|
390 |
+
" consolidated_dset.append(\n",
|
391 |
+
" ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n",
|
392 |
+
" tgt_sent.strip() )\n",
|
393 |
+
" )\n",
|
394 |
+
"\n",
|
395 |
+
"print('Create validation set') \n",
|
396 |
+
"random.shuffle(consolidated_dset)\n",
|
397 |
+
"final_set=consolidated_dset[:len(consolidated_dset)//5] \n",
|
398 |
+
"\n",
|
399 |
+
"print('Original set size: {}'.format(len(consolidated_dset))) \n",
|
400 |
+
"print('Sampled set size: {}'.format(len(final_set))) \n",
|
401 |
+
"\n",
|
402 |
+
"print('Write validation set')\n",
|
403 |
+
"\n",
|
404 |
+
"with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n",
|
405 |
+
" open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n",
|
406 |
+
" for src_sent, tgt_sent in final_set: \n",
|
407 |
+
" srcfile.write(src_sent+'\\n')\n",
|
408 |
+
" tgtfile.write(tgt_sent+'\\n')\n"
|
409 |
+
]
|
410 |
+
},
|
411 |
+
{
|
412 |
+
"cell_type": "markdown",
|
413 |
+
"metadata": {},
|
414 |
+
"source": [
|
415 |
+
"**Add language tags to test**\n",
|
416 |
+
"\n",
|
417 |
+
"- add language tags, create parallel corpus all M2O language pairs \n",
|
418 |
+
"- Create final test set"
|
419 |
+
]
|
420 |
+
},
|
421 |
+
{
|
422 |
+
"cell_type": "code",
|
423 |
+
"execution_count": null,
|
424 |
+
"metadata": {},
|
425 |
+
"outputs": [],
|
426 |
+
"source": [
|
427 |
+
"dset='test' \n",
|
428 |
+
"out_src_fname='{expdir}/final/{dset}.SRC'.format(\n",
|
429 |
+
" expdir=expdir,dset=dset)\n",
|
430 |
+
"out_tgt_fname='{expdir}/final/{dset}.TGT'.format(\n",
|
431 |
+
" expdir=expdir,dset=dset)\n",
|
432 |
+
"\n",
|
433 |
+
"os.makedirs('{expdir}/final'.format(expdir=expdir),exist_ok=True)\n",
|
434 |
+
"\n",
|
435 |
+
"print('Processing test files') \n",
|
436 |
+
"consolidated_dset=[]\n",
|
437 |
+
"for l1, l2 in tqdm(lang_pair_list):\n",
|
438 |
+
" src_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
|
439 |
+
" expdir=expdir,dset=dset,lang=l1)\n",
|
440 |
+
" tgt_fname='{expdir}/bpe/{dset}/{dset}.{lang}'.format(\n",
|
441 |
+
" expdir=expdir,dset=dset,lang=l2)\n",
|
442 |
+
"# print(src_fname)\n",
|
443 |
+
"# print(os.path.exists(src_fname))\n",
|
444 |
+
" with open(src_fname,'r',encoding='utf-8') as srcfile, \\\n",
|
445 |
+
" open(tgt_fname,'r',encoding='utf-8') as tgtfile:\n",
|
446 |
+
" for src_sent, tgt_sent in zip(srcfile,tgtfile):\n",
|
447 |
+
" consolidated_dset.append(\n",
|
448 |
+
" ( add_token(src_sent.strip(),[('src',l1),('tgt',l2)]),\n",
|
449 |
+
" tgt_sent.strip() )\n",
|
450 |
+
" )\n",
|
451 |
+
"\n",
|
452 |
+
"print('Final set size: {}'.format(len(consolidated_dset))) \n",
|
453 |
+
" \n",
|
454 |
+
"print('Write test set')\n",
|
455 |
+
"print('testset truncated')\n",
|
456 |
+
"\n",
|
457 |
+
"with open(out_src_fname,'w',encoding='utf-8') as srcfile, \\\n",
|
458 |
+
" open(out_tgt_fname,'w',encoding='utf-8') as tgtfile:\n",
|
459 |
+
" for lno, (src_sent, tgt_sent) in enumerate(consolidated_dset,1):\n",
|
460 |
+
" \n",
|
461 |
+
" s=src_sent.strip().split(' ')\n",
|
462 |
+
" t=tgt_sent.strip().split(' ')\n",
|
463 |
+
" \n",
|
464 |
+
" if len(s) > 200 or len(t) > 200:\n",
|
465 |
+
" print('exp: {}, pair: ({},{}), lno: {}: lens: ({},{})'.format(expname,l1,l2,lno,len(s),len(t))) \n",
|
466 |
+
" \n",
|
467 |
+
" src_sent=' '.join( s[:min(len(s),200)] )\n",
|
468 |
+
" tgt_sent=' '.join( t[:min(len(t),200)] )\n",
|
469 |
+
" \n",
|
470 |
+
" srcfile.write(src_sent+'\\n')\n",
|
471 |
+
" tgtfile.write(tgt_sent+'\\n')"
|
472 |
+
]
|
473 |
+
},
|
474 |
+
{
|
475 |
+
"cell_type": "markdown",
|
476 |
+
"metadata": {},
|
477 |
+
"source": [
|
478 |
+
"**Binarize data**"
|
479 |
+
]
|
480 |
+
},
|
481 |
+
{
|
482 |
+
"cell_type": "code",
|
483 |
+
"execution_count": null,
|
484 |
+
"metadata": {},
|
485 |
+
"outputs": [],
|
486 |
+
"source": [
|
487 |
+
"!echo ./binarize_training_exp.sh {expdir} SRC TGT"
|
488 |
+
]
|
489 |
+
},
|
490 |
+
{
|
491 |
+
"cell_type": "markdown",
|
492 |
+
"metadata": {},
|
493 |
+
"source": [
|
494 |
+
"**Training Command**"
|
495 |
+
]
|
496 |
+
},
|
497 |
+
{
|
498 |
+
"cell_type": "code",
|
499 |
+
"execution_count": null,
|
500 |
+
"metadata": {},
|
501 |
+
"outputs": [],
|
502 |
+
"source": [
|
503 |
+
"%%bash \n",
|
504 |
+
"\n",
|
505 |
+
"python train.py {expdir}/final_bin \\\n",
|
506 |
+
" --arch transformer \\\n",
|
507 |
+
" --optimizer adam --adam-betas '(0.9, 0.98)' --clip-norm 1.0 \\\n",
|
508 |
+
" --lr 0.0005 --lr-scheduler inverse_sqrt --warmup-updates 4000 --warmup-init-lr 1e-07 \\\n",
|
509 |
+
" --dropout 0.2 \\\n",
|
510 |
+
" --criterion label_smoothed_cross_entropy --label-smoothing 0.1 \\\n",
|
511 |
+
" --max-tokens 8192 \\\n",
|
512 |
+
" --max-update 1000000 \\\n",
|
513 |
+
" --max-source-positions 200 \\\n",
|
514 |
+
" --max-target-positions 200 \\\n",
|
515 |
+
" --tensorboard-logdir {expdir}/tensorboard \\\n",
|
516 |
+
" --save-dir {expdir}/model \\\n",
|
517 |
+
" --required-batch-size-multiple 8 \\\n",
|
518 |
+
" --save-interval 1 \\\n",
|
519 |
+
" --keep-last-epochs 5 \\\n",
|
520 |
+
" --patience 5 \\\n",
|
521 |
+
" --fp16"
|
522 |
+
]
|
523 |
+
},
|
524 |
+
{
|
525 |
+
"cell_type": "markdown",
|
526 |
+
"metadata": {},
|
527 |
+
"source": [
|
528 |
+
"**Cleanup**"
|
529 |
+
]
|
530 |
+
},
|
531 |
+
{
|
532 |
+
"cell_type": "code",
|
533 |
+
"execution_count": null,
|
534 |
+
"metadata": {},
|
535 |
+
"outputs": [],
|
536 |
+
"source": [
|
537 |
+
"# os.unlink('{}')\n",
|
538 |
+
"\n",
|
539 |
+
"to_delete=[\n",
|
540 |
+
" '{expdir}/data/train.SRC'.format(expdir=expdir,dset=dset),\n",
|
541 |
+
" '{expdir}/data/train.TGT'.format(expdir=expdir,dset=dset),\n",
|
542 |
+
" '{expdir}/bpe/train/train.SRC'.format(expdir=expdir,dset=dset),\n",
|
543 |
+
" '{expdir}/bpe/train/train.TGT'.format(expdir=expdir,dset=dset),\n",
|
544 |
+
"]`\n",
|
545 |
+
"\n",
|
546 |
+
"for fname in to_delete:\n",
|
547 |
+
" os.unlink(fname)"
|
548 |
+
]
|
549 |
+
},
|
550 |
+
{
|
551 |
+
"cell_type": "markdown",
|
552 |
+
"metadata": {},
|
553 |
+
"source": [
|
554 |
+
"**Evaluation**"
|
555 |
+
]
|
556 |
+
},
|
557 |
+
{
|
558 |
+
"cell_type": "code",
|
559 |
+
"execution_count": null,
|
560 |
+
"metadata": {},
|
561 |
+
"outputs": [],
|
562 |
+
"source": [
|
563 |
+
"dset='test' \n",
|
564 |
+
"consolidated_testoutput_fname='{expdir}/evaluations/test/default/test.SRC_TGT.TGT'.format(expdir=expdir)\n",
|
565 |
+
"consolidated_testoutput_log_fname='{}.log'.format(consolidated_testoutput_fname)\n",
|
566 |
+
"metrics_fname='{expdir}/evaluations/test/default/test.metrics.tsv'.format(expdir=expdir)\n",
|
567 |
+
" \n",
|
568 |
+
"test_set_size=2390\n",
|
569 |
+
"\n",
|
570 |
+
"consolidated_testoutput=[]\n",
|
571 |
+
"with open(consolidated_testoutput_log_fname,'r',encoding='utf-8') as hypfile:\n",
|
572 |
+
" consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),hypfile) ))\n",
|
573 |
+
" consolidated_testoutput.sort(key=lambda x: int(x.split('\\t')[0].split('-')[1]))\n",
|
574 |
+
" consolidated_testoutput=[ x.split('\\t')[2] for x in consolidated_testoutput ]\n",
|
575 |
+
"\n",
|
576 |
+
"os.makedirs('{expdir}/evaluations/test/default'.format(expdir=expdir),exist_ok=True)\n",
|
577 |
+
"\n",
|
578 |
+
"with open(consolidated_testoutput_fname,'w',encoding='utf-8') as finalhypfile:\n",
|
579 |
+
" for sent in consolidated_testoutput:\n",
|
580 |
+
" finalhypfile.write(sent+'\\n')\n",
|
581 |
+
"\n",
|
582 |
+
"print('Processing test files') \n",
|
583 |
+
"with open(metrics_fname,'w',encoding='utf-8') as metrics_file: \n",
|
584 |
+
" for i, (l1, l2) in enumerate(tqdm(lang_pair_list)):\n",
|
585 |
+
"\n",
|
586 |
+
" start=i*test_set_size\n",
|
587 |
+
" end=(i+1)*test_set_size\n",
|
588 |
+
" hyps=consolidated_testoutput[start:end]\n",
|
589 |
+
" ref_fname='{expdir}/{dset}/{dset}.{lang}'.format(\n",
|
590 |
+
" expdir=ORG_DATA_DIR,dset=dset,lang=l2)\n",
|
591 |
+
"\n",
|
592 |
+
" refs=[]\n",
|
593 |
+
" with open(ref_fname,'r',encoding='utf-8') as reffile:\n",
|
594 |
+
" refs.extend(map(lambda x:x.strip(),reffile))\n",
|
595 |
+
"\n",
|
596 |
+
" assert(len(hyps)==len(refs))\n",
|
597 |
+
"\n",
|
598 |
+
" bleu=sacrebleu.corpus_bleu(hyps,[refs],tokenize='none')\n",
|
599 |
+
"\n",
|
600 |
+
" print('{} {} {} {}'.format(l1,l2,bleu.score,bleu.prec_str))\n",
|
601 |
+
" metrics_file.write('{}\\t{}\\t{}\\t{}\\t{}\\n'.format(expname,l1,l2,bleu.score,bleu.prec_str))\n",
|
602 |
+
" "
|
603 |
+
]
|
604 |
+
}
|
605 |
+
],
|
606 |
+
"metadata": {
|
607 |
+
"kernelspec": {
|
608 |
+
"display_name": "Python 3",
|
609 |
+
"language": "python",
|
610 |
+
"name": "python3"
|
611 |
+
},
|
612 |
+
"language_info": {
|
613 |
+
"codemirror_mode": {
|
614 |
+
"name": "ipython",
|
615 |
+
"version": 3
|
616 |
+
},
|
617 |
+
"file_extension": ".py",
|
618 |
+
"mimetype": "text/x-python",
|
619 |
+
"name": "python",
|
620 |
+
"nbconvert_exporter": "python",
|
621 |
+
"pygments_lexer": "ipython3",
|
622 |
+
"version": "3.7.0"
|
623 |
+
},
|
624 |
+
"toc": {
|
625 |
+
"base_numbering": 1,
|
626 |
+
"nav_menu": {
|
627 |
+
"height": "243.993px",
|
628 |
+
"width": "160px"
|
629 |
+
},
|
630 |
+
"number_sections": true,
|
631 |
+
"sideBar": true,
|
632 |
+
"skip_h1_title": false,
|
633 |
+
"title_cell": "Table of Contents",
|
634 |
+
"title_sidebar": "Contents",
|
635 |
+
"toc_cell": false,
|
636 |
+
"toc_position": {},
|
637 |
+
"toc_section_display": true,
|
638 |
+
"toc_window_display": false
|
639 |
+
}
|
640 |
+
},
|
641 |
+
"nbformat": 4,
|
642 |
+
"nbformat_minor": 4
|
643 |
+
}
|
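The evaluation cell above scores each language pair with sacrebleu on already-tokenized hypotheses and references; reduced to a self-contained sketch with toy data:

```python
import sacrebleu

hyps = ["the cat sat on the mat"]
refs = ["the cat sat on the mat"]

# tokenize='none' because both sides are pre-tokenized, as in the notebook
bleu = sacrebleu.corpus_bleu(hyps, [refs], tokenize="none")
print(bleu.score)
```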
legacy/install_fairseq.sh
ADDED
@@ -0,0 +1,45 @@
|
1 |
+
#NVIDIA CUDA download
|
2 |
+
wget "https://developer.nvidia.com/compute/cuda/10.0/Prod/local_installers/cuda_10.0.130_410.48_linux"
|
3 |
+
wget "http://developer.download.nvidia.com/compute/cuda/10.0/Prod/patches/1/cuda_10.0.130.1_linux.run"
|
4 |
+
|
5 |
+
## do not install drivers (See this: https://docs.nvidia.com/deploy/cuda-compatibility/index.html)
|
6 |
+
sudo sh "cuda_10.0.130_410.48_linux"
|
7 |
+
sudo sh "cuda_10.0.130.1_linux.run"
|
8 |
+
|
9 |
+
#Set environment variables
|
10 |
+
export CUDA_HOME=/usr/local/cuda-10.0
|
11 |
+
export PATH=$CUDA_HOME/bin:$PATH
|
12 |
+
export LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH
|
13 |
+
|
14 |
+
# Install pytorch 1.2
|
15 |
+
python3 -m venv pytorch1.2
|
16 |
+
source pytorch1.2/bin/activate
|
17 |
+
which pip3
|
18 |
+
pip3 install torch==1.2.0 torchvision==0.4.0
|
19 |
+
|
20 |
+
# Install nccl
|
21 |
+
git clone https://github.com/NVIDIA/nccl.git
|
22 |
+
cd nccl
|
23 |
+
make src.build CUDA_HOME=$CUDA_HOME
|
24 |
+
sudo apt install build-essential devscripts debhelper fakeroot
|
25 |
+
make pkg.debian.build CUDA_HOME=$CUDA_HOME
|
26 |
+
sudo dpkg -i build/pkg/deb/libnccl2_2.7.8-1+cuda10.0_amd64.deb
|
27 |
+
sudo dpkg -i build/pkg/deb/libnccl-dev_2.7.8-1+cuda10.0_amd64.deb
|
28 |
+
sudo apt-get install -f
|
29 |
+
|
30 |
+
# Install Apex
|
31 |
+
git clone https://github.com/NVIDIA/apex
|
32 |
+
cd apex
|
33 |
+
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" \
|
34 |
+
--global-option="--deprecated_fused_adam" --global-option="--xentropy" \
|
35 |
+
--global-option="--fast_multihead_attn" ./
|
36 |
+
|
37 |
+
# Install PyArrow
|
38 |
+
pip install pyarrow
|
39 |
+
|
40 |
+
# Install fairseq
|
41 |
+
pip install --editable ./
|
42 |
+
|
43 |
+
# Install other dependencies
|
44 |
+
pip install sacrebleu
|
45 |
+
pip install tensorboardX --user
|
legacy/run_inference.sh
ADDED
@@ -0,0 +1,80 @@
|
1 |
+
src_lang=${1:-hi}
|
2 |
+
tgt_lang=${2:-en}
|
3 |
+
bucket_path=${3:-gs://ai4b-anuvaad-nmt/baselines/transformer-base/baselines-${src_lang}-${tgt_lang}}
|
4 |
+
|
5 |
+
expdir=../baselines/baselines-${src_lang}-${tgt_lang}
|
6 |
+
|
7 |
+
if [[ -d $expdir ]]
|
8 |
+
then
|
9 |
+
echo "$expdir exists on your filesystem. Please delete this if you have made some changes to the bucket files and trying to redownload"
|
10 |
+
else
|
11 |
+
mkdir -p $expdir
|
12 |
+
mkdir -p $expdir/model
|
13 |
+
cd ../baselines
|
14 |
+
gsutil -m cp -r $bucket_path/vocab $expdir
|
15 |
+
gsutil -m cp -r $bucket_path/final_bin $expdir
|
16 |
+
gsutil -m cp $bucket_path/model/checkpoint_best.pt $expdir/model
|
17 |
+
cd ../indicTrans
|
18 |
+
fi
|
19 |
+
|
20 |
+
|
21 |
+
if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then
|
22 |
+
#TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all)
|
23 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news )
|
24 |
+
elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then
|
25 |
+
# TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
|
26 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018 wmt-news ufal-ta)
|
27 |
+
elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then
|
28 |
+
# TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
|
29 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
|
30 |
+
elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then
|
31 |
+
# TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all)
|
32 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest wmt-news )
|
33 |
+
elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then
|
34 |
+
TEST_SETS=( pmi )
|
35 |
+
elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then
|
36 |
+
# TEST_SETS=( wat2021-devtest anuvaad-legal all)
|
37 |
+
TEST_SETS=( wat2021-devtest )
|
38 |
+
elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then
|
39 |
+
# TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all)
|
40 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
|
41 |
+
elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then
|
42 |
+
# TEST_SETS=( wat2021-devtest wat2020-devtest all)
|
43 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest )
|
44 |
+
elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then
|
45 |
+
TEST_SETS=( wat2021-devtest )
|
46 |
+
elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then
|
47 |
+
TEST_SETS=( wat2021-devtest )
|
48 |
+
elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then
|
49 |
+
# TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all )
|
50 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest wat-2018)
|
51 |
+
fi
|
52 |
+
|
53 |
+
if [ $src_lang == 'en' ]; then
|
54 |
+
indic_lang=$tgt_lang
|
55 |
+
else
|
56 |
+
indic_lang=$src_lang
|
57 |
+
fi
|
58 |
+
|
59 |
+
|
60 |
+
for tset in ${TEST_SETS[@]};do
|
61 |
+
echo $tset $src_lang $tgt_lang
|
62 |
+
if [ $tset == 'wat2021-devtest' ]; then
|
63 |
+
SRC_FILE=${expdir}/benchmarks/$tset/test.$src_lang
|
64 |
+
REF_FILE=${expdir}/benchmarks/$tset/test.$tgt_lang
|
65 |
+
else
|
66 |
+
SRC_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$src_lang
|
67 |
+
REF_FILE=${expdir}/benchmarks/$tset/en-${indic_lang}/test.$tgt_lang
|
68 |
+
fi
|
69 |
+
RESULTS_DIR=${expdir}/results/$tset
|
70 |
+
|
71 |
+
mkdir -p $RESULTS_DIR
|
72 |
+
|
73 |
+
bash translate.sh $SRC_FILE $RESULTS_DIR/${src_lang}-${tgt_lang} $src_lang $tgt_lang $expdir $REF_FILE
|
74 |
+
# for newline between different outputs
|
75 |
+
echo
|
76 |
+
done
|
77 |
+
# send the results to the bucket
|
78 |
+
gsutil -m cp -r $expdir/results $bucket_path
|
79 |
+
# clear up the space in the instance
|
80 |
+
# rm -r $expdir
|
legacy/run_joint_inference.sh
ADDED
@@ -0,0 +1,74 @@
|
1 |
+
src_lang=${1:-en}
|
2 |
+
tgt_lang=${2:-indic}
|
3 |
+
bucket_path=${3:-gs://ai4b-anuvaad-nmt/models/transformer-4x/indictrans-${src_lang}-${tgt_lang}}
|
4 |
+
|
5 |
+
mkdir -p ../baselines
|
6 |
+
expdir=../baselines/baselines-${src_lang}-${tgt_lang}
|
7 |
+
|
8 |
+
if [[ -d $expdir ]]
|
9 |
+
then
|
10 |
+
echo "$expdir exists on your filesystem."
|
11 |
+
else
|
12 |
+
cd ../baselines
|
13 |
+
mkdir -p baselines-${src_lang}-${tgt_lang}/model
|
14 |
+
mkdir -p baselines-${src_lang}-${tgt_lang}/final_bin
|
15 |
+
cd baselines-${src_lang}-${tgt_lang}/model
|
16 |
+
gsutil -m cp $bucket_path/model/checkpoint_best.pt .
|
17 |
+
cd ..
|
18 |
+
gsutil -m cp $bucket_path/vocab .
|
19 |
+
gsutil -m cp $bucket_path/final_bin/dict.* final_bin
|
20 |
+
cd ../indicTrans
|
21 |
+
fi
|
22 |
+
|
23 |
+
|
24 |
+
|
25 |
+
|
26 |
+
|
27 |
+
if [ $src_lang == 'hi' ] || [ $tgt_lang == 'hi' ]; then
|
28 |
+
TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 sap-documentation-benchmark all)
|
29 |
+
elif [ $src_lang == 'ta' ] || [ $tgt_lang == 'ta' ]; then
|
30 |
+
TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
|
31 |
+
elif [ $src_lang == 'bn' ] || [ $tgt_lang == 'bn' ]; then
|
32 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal tico19 all)
|
33 |
+
elif [ $src_lang == 'gu' ] || [ $tgt_lang == 'gu' ]; then
|
34 |
+
TEST_SETS=( wmt-news wat2021-devtest wat2020-devtest all)
|
35 |
+
elif [ $src_lang == 'as' ] || [ $tgt_lang == 'as' ]; then
|
36 |
+
TEST_SETS=( all )
|
37 |
+
elif [ $src_lang == 'kn' ] || [ $tgt_lang == 'kn' ]; then
|
38 |
+
TEST_SETS=( wat2021-devtest anuvaad-legal all)
|
39 |
+
elif [ $src_lang == 'ml' ] || [ $tgt_lang == 'ml' ]; then
|
40 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all)
|
41 |
+
elif [ $src_lang == 'mr' ] || [ $tgt_lang == 'mr' ]; then
|
42 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest all)
|
43 |
+
elif [ $src_lang == 'or' ] || [ $tgt_lang == 'or' ]; then
|
44 |
+
TEST_SETS=( all )
|
45 |
+
elif [ $src_lang == 'pa' ] || [ $tgt_lang == 'pa' ]; then
|
46 |
+
TEST_SETS=( all )
|
47 |
+
elif [ $src_lang == 'te' ] || [ $tgt_lang == 'te' ]; then
|
48 |
+
TEST_SETS=( wat2021-devtest wat2020-devtest anuvaad-legal all )
|
49 |
+
fi
|
50 |
+
|
51 |
+
if [ $src_lang == 'en' ]; then
|
52 |
+
indic_lang=$tgt_lang
|
53 |
+
else
|
54 |
+
indic_lang=$src_lang
|
55 |
+
fi
|
56 |
+
|
57 |
+
|
58 |
+
for tset in ${TEST_SETS[@]};do
|
59 |
+
echo $tset $src_lang $tgt_lang
|
60 |
+
if [ $tset == 'wat2021-devtest' ]; then
|
61 |
+
SRC_FILE=${expdir}/devtest/$tset/test.$src_lang
|
62 |
+
REF_FILE=${expdir}/devtest/$tset/test.$tgt_lang
|
63 |
+
else
|
64 |
+
SRC_FILE=${expdir}/devtest/$tset/en-${indic_lang}/test.$src_lang
|
65 |
+
REF_FILE=${expdir}/devtest/$tset/en-${indic_lang}/test.$tgt_lang
|
66 |
+
fi
|
67 |
+
RESULTS_DIR=${expdir}/results/$tset
|
68 |
+
|
69 |
+
mkdir -p $RESULTS_DIR
|
70 |
+
|
71 |
+
bash joint_translate.sh $SRC_FILE $RESULTS_DIR/${src_lang}-${tgt_lang} $src_lang $tgt_lang $expdir $REF_FILE
|
72 |
+
# for newline between different outputs
|
73 |
+
echo
|
74 |
+
done
|
legacy/tpu_training_instructions.md
ADDED
@@ -0,0 +1,92 @@
|
1 |
+
## Instructions to run on Google cloud TPUs
|
2 |
+
Before starting these steps, make sure to prepare the dataset (normalization -> BPE -> ... -> binarization) by following the steps in the indicTrans workflow, or do these steps on a CPU instance before launching the TPU instance (to save time and costs).
|
3 |
+
|
4 |
+
### Creating TPU instance
|
5 |
+
|
6 |
+
- Create a cpu instance on gcp with `torch-xla` image like:
|
7 |
+
```bash
|
8 |
+
gcloud compute --project=${PROJECT_ID} instances create <name for your instance> \
|
9 |
+
--zone=<zone> \
|
10 |
+
--machine-type=n1-standard-16 \
|
11 |
+
--image-family=torch-xla \
|
12 |
+
--image-project=ml-images \
|
13 |
+
--boot-disk-size=200GB \
|
14 |
+
--scopes=https://www.googleapis.com/auth/cloud-platform
|
15 |
+
```
|
16 |
+
- Once the instance is created, launch a Cloud TPU (from your CPU VM instance) using the following command (you can change the `accelerator-type` according to your needs):
|
17 |
+
```bash
|
18 |
+
gcloud compute tpus create <name for your TPU> \
|
19 |
+
--zone=<zone> \
|
20 |
+
--network=default \
|
21 |
+
--version=pytorch-1.7 \
|
22 |
+
--accelerator-type=v3-8
|
23 |
+
```
|
24 |
+
(or)
|
25 |
+
Create a new tpu using the GUI in https://console.cloud.google.com/compute/tpus and make sure to select `version` as `pytorch 1.7`.
|
26 |
+
|
27 |
+
- Once the tpu is launched, identify its ip address:
|
28 |
+
```bash
|
29 |
+
# you can run this inside cpu instance and note down the IP address which is located under the NETWORK_ENDPOINTS column
|
30 |
+
gcloud compute tpus list --zone=us-central1-a
|
31 |
+
```
|
32 |
+
(or)
|
33 |
+
Go to https://console.cloud.google.com/compute/tpus and note down the IP address of the created TPU from the `internal ip` column
|
34 |
+
|
35 |
+
### Installing Fairseq, getting data on the cpu instance
|
36 |
+
|
37 |
+
- Activate the `torch xla 1.7` conda environment and install necessary libs for IndicTrans (**Excluding FairSeq**):
|
38 |
+
```bash
|
39 |
+
conda activate torch-xla-1.7
|
40 |
+
pip install sacremoses pandas mock sacrebleu tensorboardX pyarrow
|
41 |
+
```
|
42 |
+
- Configure environment variables for TPU:
|
43 |
+
```bash
|
44 |
+
export TPU_IP_ADDRESS=ip-address; \
|
45 |
+
export XRT_TPU_CONFIG="tpu_worker;0;$TPU_IP_ADDRESS:8470"
|
46 |
+
```
|
47 |
+
- Download the prepared binarized data for FairSeq
|
48 |
+
|
49 |
+
- Clone the latest version of fairseq (this supports TPUs) and install it from source. There is an [issue](https://github.com/pytorch/fairseq/issues/3259) with the latest commit, so we use a different commit to install from source (this may have been fixed in the latest master, but we have not tested it):
|
50 |
+
```bash
|
51 |
+
git clone https://github.com/pytorch/fairseq.git
|
52 |
+
cd fairseq && git checkout da9eaba12d82b9bfc1442f0e2c6fc1b895f4d35d
|
53 |
+
pip install --editable ./
|
54 |
+
```
|
55 |
+
|
56 |
+
- Start TPU training
|
57 |
+
```bash
|
58 |
+
# this is for using all tpu cores
|
59 |
+
export MKL_SERVICE_FORCE_INTEL=1
|
60 |
+
|
61 |
+
fairseq-train {expdir}/exp2_m2o_baseline/final_bin \
|
62 |
+
--max-source-positions=200 \
|
63 |
+
--max-target-positions=200 \
|
64 |
+
--max-update=1000000 \
|
65 |
+
--save-interval=5 \
|
66 |
+
--arch=transformer \
|
67 |
+
--attention-dropout=0.1 \
|
68 |
+
--criterion=label_smoothed_cross_entropy \
|
69 |
+
--source-lang=SRC \
|
70 |
+
--lr-scheduler=inverse_sqrt \
|
71 |
+
--skip-invalid-size-inputs-valid-test \
|
72 |
+
--target-lang=TGT \
|
73 |
+
--label-smoothing=0.1 \
|
74 |
+
--update-freq=1 \
|
75 |
+
--optimizer adam \
|
76 |
+
--adam-betas '(0.9, 0.98)' \
|
77 |
+
--warmup-init-lr 1e-07 \
|
78 |
+
--lr 0.0005 \
|
79 |
+
--warmup-updates 4000 \
|
80 |
+
--dropout 0.2 \
|
81 |
+
--weight-decay 0.0 \
|
82 |
+
--tpu \
|
83 |
+
--distributed-world-size 8 \
|
84 |
+
--max-tokens 8192 \
|
85 |
+
--num-batch-buckets 8 \
|
86 |
+
--tensorboard-logdir {expdir}/exp2_m2o_baseline/tensorboard \
|
87 |
+
--save-dir {expdir}/exp2_m2o_baseline/model \
|
88 |
+
--keep-last-epochs 5 \
|
89 |
+
--patience 5
|
90 |
+
```
|
91 |
+
|
92 |
+
**Note** While training, we noticed that training was slower on TPUs than on multiple GPUs. We have documented some issues and [filed an issue](https://github.com/pytorch/fairseq/issues/3317) at the fairseq repo for advice. We'll update this section as we learn more about efficient training on TPUs. Also, feel free to open an issue/pull request if you find a bug or know an efficient way to make the code train faster on TPUs.
|
legacy/translate.sh
ADDED
@@ -0,0 +1,70 @@
|
1 |
+
#!/bin/bash
|
2 |
+
echo `date`
|
3 |
+
infname=$1
|
4 |
+
outfname=$2
|
5 |
+
src_lang=$3
|
6 |
+
tgt_lang=$4
|
7 |
+
exp_dir=$5
|
8 |
+
ref_fname=$6
|
9 |
+
|
10 |
+
if [ $src_lang == 'en' ]; then
|
11 |
+
SRC_PREFIX='TGT'
|
12 |
+
TGT_PREFIX='SRC'
|
13 |
+
else
|
14 |
+
SRC_PREFIX='SRC'
|
15 |
+
TGT_PREFIX='TGT'
|
16 |
+
fi
|
17 |
+
|
18 |
+
#`dirname $0`/env.sh
|
19 |
+
SUBWORD_NMT_DIR='subword-nmt'
|
20 |
+
model_dir=$exp_dir/model
|
21 |
+
data_bin_dir=$exp_dir/final_bin
|
22 |
+
|
23 |
+
### normalization and script conversion
|
24 |
+
|
25 |
+
echo "Applying normalization and script conversion"
|
26 |
+
input_size=`python preprocess_translate.py $infname $outfname.norm $src_lang`
|
27 |
+
echo "Number of sentences in input: $input_size"
|
28 |
+
|
29 |
+
### apply BPE to input file
|
30 |
+
|
31 |
+
echo "Applying BPE"
|
32 |
+
python $SUBWORD_NMT_DIR/subword_nmt/apply_bpe.py \
|
33 |
+
-c $exp_dir/vocab/bpe_codes.32k.${SRC_PREFIX}_${TGT_PREFIX} \
|
34 |
+
--vocabulary $exp_dir/vocab/vocab.$SRC_PREFIX \
|
35 |
+
--vocabulary-threshold 5 \
|
36 |
+
< $outfname.norm \
|
37 |
+
> $outfname.bpe
|
38 |
+
|
39 |
+
# not needed for joint training
|
40 |
+
# echo "Adding language tags"
|
41 |
+
# python add_tags_translate.py $outfname._bpe $outfname.bpe $src_lang $tgt_lang
|
42 |
+
|
43 |
+
### run decoder
|
44 |
+
|
45 |
+
echo "Decoding"
|
46 |
+
|
47 |
+
src_input_bpe_fname=$outfname.bpe
|
48 |
+
tgt_output_fname=$outfname
|
49 |
+
fairseq-interactive $data_bin_dir \
|
50 |
+
-s $SRC_PREFIX -t $TGT_PREFIX \
|
51 |
+
--distributed-world-size 1 \
|
52 |
+
--path $model_dir/checkpoint_best.pt \
|
53 |
+
--batch-size 64 --buffer-size 2500 --beam 5 --remove-bpe \
|
54 |
+
--skip-invalid-size-inputs-valid-test \
|
55 |
+
--input $src_input_bpe_fname > $tgt_output_fname.log 2>&1
|
56 |
+
|
57 |
+
|
58 |
+
echo "Extracting translations, script conversion and detokenization"
|
59 |
+
python postprocess_translate.py $tgt_output_fname.log $tgt_output_fname $input_size $tgt_lang
|
60 |
+
if [ $src_lang == 'en' ]; then
|
61 |
+
# indicnlp tokenize the output files before evaluation
|
62 |
+
input_size=`python preprocess_translate.py $ref_fname $ref_fname.tok $tgt_lang`
|
63 |
+
input_size=`python preprocess_translate.py $tgt_output_fname $tgt_output_fname.tok $tgt_lang`
|
64 |
+
sacrebleu --tokenize none $ref_fname.tok < $tgt_output_fname.tok
|
65 |
+
else
|
66 |
+
# indic to en models
|
67 |
+
sacrebleu $ref_fname < $tgt_output_fname
|
68 |
+
fi
|
69 |
+
echo `date`
|
70 |
+
echo "Translation completed"
|
model_configs/__init__.py
ADDED
@@ -0,0 +1 @@
|
1 |
+
from . import custom_transformer
|
model_configs/custom_transformer.py
ADDED
@@ -0,0 +1,38 @@
|
1 |
+
from fairseq.models import register_model_architecture
|
2 |
+
from fairseq.models.transformer import base_architecture
|
3 |
+
|
4 |
+
|
5 |
+
@register_model_architecture("transformer", "transformer_2x")
|
6 |
+
def transformer_big(args):
|
7 |
+
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1024)
|
8 |
+
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
|
9 |
+
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
|
10 |
+
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
|
11 |
+
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1024)
|
12 |
+
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
|
13 |
+
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
|
14 |
+
base_architecture(args)
|
15 |
+
|
16 |
+
|
17 |
+
@register_model_architecture("transformer", "transformer_4x")
|
18 |
+
def transformer_huge(args):
|
19 |
+
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1536)
|
20 |
+
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)
|
21 |
+
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
|
22 |
+
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
|
23 |
+
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 1536)
|
24 |
+
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 4096)
|
25 |
+
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
|
26 |
+
base_architecture(args)
|
27 |
+
|
28 |
+
|
29 |
+
@register_model_architecture("transformer", "transformer_9x")
|
30 |
+
def transformer_xlarge(args):
|
31 |
+
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 2048)
|
32 |
+
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 8192)
|
33 |
+
args.encoder_attention_heads = getattr(args, "encoder_attention_heads", 16)
|
34 |
+
args.encoder_normalize_before = getattr(args, "encoder_normalize_before", False)
|
35 |
+
args.decoder_embed_dim = getattr(args, "decoder_embed_dim", 2048)
|
36 |
+
args.decoder_ffn_embed_dim = getattr(args, "decoder_ffn_embed_dim", 8192)
|
37 |
+
args.decoder_attention_heads = getattr(args, "decoder_attention_heads", 16)
|
38 |
+
base_architecture(args)
|
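These architecture functions only fill in defaults: `getattr(args, name, default)` keeps any value already supplied on the command line, and the registered names become selectable through fairseq's `--arch` flag once this package is imported (e.g. via `--user-dir`). A standalone sketch of the default-filling behaviour (not using fairseq):

```python
from argparse import Namespace

args = Namespace(encoder_embed_dim=512)  # value supplied by the user

# same pattern as transformer_4x above: defaults apply only when unset
args.encoder_embed_dim = getattr(args, "encoder_embed_dim", 1536)
args.encoder_ffn_embed_dim = getattr(args, "encoder_ffn_embed_dim", 4096)

print(args.encoder_embed_dim, args.encoder_ffn_embed_dim)  # 512 4096
```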
requirements.txt
ADDED
@@ -0,0 +1,11 @@
|
1 |
+
sacremoses
|
2 |
+
pandas
|
3 |
+
mock
|
4 |
+
sacrebleu
|
5 |
+
pyarrow
|
6 |
+
indic-nlp-library
|
7 |
+
mosestokenizer
|
8 |
+
subword-nmt
|
9 |
+
numpy
|
10 |
+
tensorboardX
|
11 |
+
git+https://github.com/pytorch/fairseq.git
|
scripts/__init__.py
ADDED
File without changes
|
scripts/add_joint_tags_translate.py
ADDED
@@ -0,0 +1,61 @@
|
1 |
+
import sys
|
2 |
+
from tqdm import tqdm
|
3 |
+
import os
|
4 |
+
|
5 |
+
|
6 |
+
def add_token(sent, tag_infos):
|
7 |
+
""" add special tokens specified by tag_infos to each element in list
|
8 |
+
|
9 |
+
tag_infos: list of tuples (tag_type,tag)
|
10 |
+
|
11 |
+
each tag_info results in a token of the form: __{tag_type}__{tag}__
|
12 |
+
|
13 |
+
"""
|
14 |
+
|
15 |
+
tokens = []
|
16 |
+
for tag_type, tag in tag_infos:
|
17 |
+
token = '__' + tag_type + '__' + tag + '__'
|
18 |
+
tokens.append(token)
|
19 |
+
|
20 |
+
return ' '.join(tokens) + ' ' + sent
|
21 |
+
|
22 |
+
|
23 |
+
def generate_lang_tag_iterator(infname):
|
24 |
+
with open(infname, 'r', encoding='utf-8') as infile:
|
25 |
+
for line in infile:
|
26 |
+
src, tgt, count = line.strip().split('\t')
|
27 |
+
count = int(count)
|
28 |
+
for _ in range(count):
|
29 |
+
yield (src, tgt)
|
30 |
+
|
31 |
+
|
32 |
+
if __name__ == '__main__':
|
33 |
+
|
34 |
+
expdir = sys.argv[1]
|
35 |
+
dset = sys.argv[2]
|
36 |
+
|
37 |
+
src_fname = '{expdir}/bpe/{dset}.SRC'.format(
|
38 |
+
expdir=expdir, dset=dset)
|
39 |
+
tgt_fname = '{expdir}/bpe/{dset}.TGT'.format(
|
40 |
+
expdir=expdir, dset=dset)
|
41 |
+
meta_fname = '{expdir}/data/{dset}_lang_pairs.txt'.format(
|
42 |
+
expdir=expdir, dset=dset)
|
43 |
+
|
44 |
+
out_src_fname = '{expdir}/final/{dset}.SRC'.format(
|
45 |
+
expdir=expdir, dset=dset)
|
46 |
+
out_tgt_fname = '{expdir}/final/{dset}.TGT'.format(
|
47 |
+
expdir=expdir, dset=dset)
|
48 |
+
lang_tag_iterator = generate_lang_tag_iterator(meta_fname)
|
49 |
+
|
50 |
+
os.makedirs('{expdir}/final'.format(expdir=expdir), exist_ok=True)
|
51 |
+
|
52 |
+
with open(src_fname, 'r', encoding='utf-8') as srcfile, \
|
53 |
+
open(tgt_fname, 'r', encoding='utf-8') as tgtfile, \
|
54 |
+
open(out_src_fname, 'w', encoding='utf-8') as outsrcfile, \
|
55 |
+
open(out_tgt_fname, 'w', encoding='utf-8') as outtgtfile:
|
56 |
+
|
57 |
+
for (l1, l2), src_sent, tgt_sent in tqdm(zip(lang_tag_iterator,
|
58 |
+
srcfile, tgtfile)):
|
59 |
+
outsrcfile.write(add_token(src_sent.strip(), [
|
60 |
+
('src', l1), ('tgt', l2)]) + '\n')
|
61 |
+
outtgtfile.write(tgt_sent.strip() + '\n')
|
scripts/add_tags_translate.py
ADDED
@@ -0,0 +1,33 @@
|
1 |
+
import sys
|
2 |
+
|
3 |
+
|
4 |
+
def add_token(sent, tag_infos):
|
5 |
+
""" add special tokens specified by tag_infos to each element in list
|
6 |
+
|
7 |
+
tag_infos: list of tuples (tag_type,tag)
|
8 |
+
|
9 |
+
each tag_info results in a token of the form: __{tag_type}__{tag}__
|
10 |
+
|
11 |
+
"""
|
12 |
+
|
13 |
+
tokens = []
|
14 |
+
for tag_type, tag in tag_infos:
|
15 |
+
token = '__' + tag_type + '__' + tag + '__'
|
16 |
+
tokens.append(token)
|
17 |
+
|
18 |
+
return ' '.join(tokens) + ' ' + sent
|
19 |
+
|
20 |
+
|
21 |
+
if __name__ == '__main__':
|
22 |
+
|
23 |
+
infname = sys.argv[1]
|
24 |
+
outfname = sys.argv[2]
|
25 |
+
src_lang = sys.argv[3]
|
26 |
+
tgt_lang = sys.argv[4]
|
27 |
+
|
28 |
+
with open(infname, 'r', encoding='utf-8') as infile, \
|
29 |
+
open(outfname, 'w', encoding='utf-8') as outfile:
|
30 |
+
for line in infile:
|
31 |
+
outstr = add_token(
|
32 |
+
line.strip(), [('src', src_lang), ('tgt', tgt_lang)])
|
33 |
+
outfile.write(outstr + '\n')
|
scripts/clean_vocab.py
ADDED
@@ -0,0 +1,19 @@
|
1 |
+
import sys
|
2 |
+
import codecs
|
3 |
+
|
4 |
+
def clean_vocab(in_vocab_fname, out_vocab_fname):
|
5 |
+
with codecs.open(in_vocab_fname, "r", encoding="utf-8") as infile, codecs.open(
|
6 |
+
out_vocab_fname, "w", encoding="utf-8"
|
7 |
+
) as outfile:
|
8 |
+
for i, line in enumerate(infile):
|
9 |
+
fields = line.strip("\r\n ").split(" ")
|
10 |
+
if len(fields) == 2:
|
11 |
+
outfile.write(line)
|
12 |
+
if len(fields) != 2:
|
13 |
+
print("{}: {}".format(i, line.strip()))
|
14 |
+
for c in line:
|
15 |
+
print("{}:{}".format(c, hex(ord(c))))
|
16 |
+
|
17 |
+
|
18 |
+
if __name__ == "__main__":
|
19 |
+
clean_vocab(sys.argv[1], sys.argv[2])
|
scripts/concat_joint_data.py
ADDED
@@ -0,0 +1,130 @@
|
|
1 |
+
import os
|
2 |
+
from tqdm import tqdm
|
3 |
+
import sys
|
4 |
+
|
5 |
+
LANGS = [
|
6 |
+
"as",
|
7 |
+
"bn",
|
8 |
+
"gu",
|
9 |
+
"hi",
|
10 |
+
"kn",
|
11 |
+
"ml",
|
12 |
+
"mr",
|
13 |
+
"or",
|
14 |
+
"pa",
|
15 |
+
"ta",
|
16 |
+
"te",
|
17 |
+
#"ur"
|
18 |
+
]
|
19 |
+
|
20 |
+
|
21 |
+
def add_token(sent, tag_infos):
|
22 |
+
""" add special tokens specified by tag_infos to each element in list
|
23 |
+
|
24 |
+
tag_infos: list of tuples (tag_type,tag)
|
25 |
+
|
26 |
+
each tag_info results in a token of the form: __{tag_type}__{tag}__
|
27 |
+
|
28 |
+
"""
|
29 |
+
|
30 |
+
tokens = []
|
31 |
+
for tag_type, tag in tag_infos:
|
32 |
+
token = '__' + tag_type + '__' + tag + '__'
|
33 |
+
tokens.append(token)
|
34 |
+
|
35 |
+
return ' '.join(tokens) + ' ' + sent
|
36 |
+
|
37 |
+
|
38 |
+
def concat_data(data_dir, outdir, lang_pair_list,
|
39 |
+
out_src_lang='SRC', out_trg_lang='TGT', split='train'):
|
40 |
+
"""
|
41 |
+
data_dir: input dir, contains directories for language pairs named l1-l2
|
42 |
+
"""
|
43 |
+
os.makedirs(outdir, exist_ok=True)
|
44 |
+
|
45 |
+
out_src_fname = '{}/{}.{}'.format(outdir, split, out_src_lang)
|
46 |
+
out_trg_fname = '{}/{}.{}'.format(outdir, split, out_trg_lang)
|
47 |
+
# out_meta_fname='{}/metadata.txt'.format(outdir)
|
48 |
+
|
49 |
+
print()
|
50 |
+
print(out_src_fname)
|
51 |
+
print(out_trg_fname)
|
52 |
+
# print(out_meta_fname)
|
53 |
+
|
54 |
+
# concatenate train data
|
55 |
+
if os.path.isfile(out_src_fname):
|
56 |
+
os.unlink(out_src_fname)
|
57 |
+
if os.path.isfile(out_trg_fname):
|
58 |
+
os.unlink(out_trg_fname)
|
59 |
+
# if os.path.isfile(out_meta_fname):
|
60 |
+
# os.unlink(out_meta_fname)
|
61 |
+
|
62 |
+
for src_lang, trg_lang in tqdm(lang_pair_list):
|
63 |
+
print('src: {}, tgt:{}'.format(src_lang, trg_lang))
|
64 |
+
|
65 |
+
in_src_fname = '{}/{}-{}/{}.{}'.format(
|
66 |
+
data_dir, src_lang, trg_lang, split, src_lang)
|
67 |
+
in_trg_fname = '{}/{}-{}/{}.{}'.format(
|
68 |
+
data_dir, src_lang, trg_lang, split, trg_lang)
|
69 |
+
|
70 |
+
if not os.path.exists(in_src_fname):
|
71 |
+
continue
|
72 |
+
if not os.path.exists(in_trg_fname):
|
73 |
+
continue
|
74 |
+
|
75 |
+
print(in_src_fname)
|
76 |
+
os.system('cat {} >> {}'.format(in_src_fname, out_src_fname))
|
77 |
+
|
78 |
+
print(in_trg_fname)
|
79 |
+
os.system('cat {} >> {}'.format(in_trg_fname, out_trg_fname))
|
80 |
+
|
81 |
+
|
82 |
+
# with open('{}/lang_pairs.txt'.format(outdir),'w',encoding='utf-8') as lpfile:
|
83 |
+
# lpfile.write('\n'.join( [ '-'.join(x) for x in lang_pair_list ] ))
|
84 |
+
|
85 |
+
corpus_stats(data_dir, outdir, lang_pair_list, split)
|
86 |
+
|
87 |
+
|
88 |
+
def corpus_stats(data_dir, outdir, lang_pair_list, split):
|
89 |
+
"""
|
90 |
+
data_dir: input dir, contains directories for language pairs named l1-l2
|
91 |
+
"""
|
92 |
+
|
93 |
+
with open('{}/{}_lang_pairs.txt'.format(outdir, split), 'w', encoding='utf-8') as lpfile:
|
94 |
+
|
95 |
+
for src_lang, trg_lang in tqdm(lang_pair_list):
|
96 |
+
print('src: {}, tgt:{}'.format(src_lang, trg_lang))
|
97 |
+
|
98 |
+
in_src_fname = '{}/{}-{}/{}.{}'.format(
|
99 |
+
data_dir, src_lang, trg_lang, split, src_lang)
|
100 |
+
# in_trg_fname='{}/{}-{}/train.{}'.format(data_dir,src_lang,trg_lang,trg_lang)
|
101 |
+
if not os.path.exists(in_src_fname):
|
102 |
+
continue
|
103 |
+
|
104 |
+
print(in_src_fname)
|
105 |
+
corpus_size = 0
|
106 |
+
with open(in_src_fname, 'r', encoding='utf-8') as infile:
|
107 |
+
corpus_size = sum(map(lambda x: 1, infile))
|
108 |
+
|
109 |
+
lpfile.write('{}\t{}\t{}\n'.format(
|
110 |
+
src_lang, trg_lang, corpus_size))
|
111 |
+
|
112 |
+
|
113 |
+
if __name__ == '__main__':
|
114 |
+
|
115 |
+
in_dir = sys.argv[1]
|
116 |
+
out_dir = sys.argv[2]
|
117 |
+
src_lang = sys.argv[3]
|
118 |
+
tgt_lang = sys.argv[4]
|
119 |
+
split = sys.argv[5]
|
120 |
+
lang_pair_list = []
|
121 |
+
|
122 |
+
if src_lang == 'en':
|
123 |
+
for lang in LANGS:
|
124 |
+
lang_pair_list.append(['en', lang])
|
125 |
+
else:
|
126 |
+
for lang in LANGS:
|
127 |
+
lang_pair_list.append([lang, 'en'])
|
128 |
+
|
129 |
+
concat_data(in_dir, out_dir, lang_pair_list, split=split)
|
130 |
+
|
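A usage sketch of the script above (paths are hypothetical), mirroring its CLI entry point for the many-to-one direction:

```python
from scripts.concat_joint_data import concat_data, LANGS

# xx -> en pairs, as built in the __main__ block when src_lang != 'en'
lang_pair_list = [[lang, "en"] for lang in LANGS]

concat_data("data/train", "exp2_m2o_baseline/data", lang_pair_list, split="train")
# writes train.SRC / train.TGT plus train_lang_pairs.txt with per-pair sentence counts
```

Equivalently, from the shell: `python scripts/concat_joint_data.py data/train exp2_m2o_baseline/data hi en train` (the third and fourth arguments only decide the translation direction).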
scripts/extract_non_english_pairs.py
ADDED
@@ -0,0 +1,108 @@
|
1 |
+
from tqdm import tqdm
|
2 |
+
import os
|
3 |
+
from collections import defaultdict
|
4 |
+
|
5 |
+
|
6 |
+
def read_file(fname):
|
7 |
+
with open(fname, "r", encoding="utf-8") as infile:
|
8 |
+
for line in infile:
|
9 |
+
yield line.strip()
|
10 |
+
|
11 |
+
|
12 |
+
def extract_non_english_pairs(indir, outdir, LANGS):
|
13 |
+
"""
|
14 |
+
Extracts non-english pair parallel corpora
|
15 |
+
|
16 |
+
indir: contains english centric data in the following form:
|
17 |
+
- directory named en-xx for language xx
|
18 |
+
- each directory contains a train.en and train.xx
|
19 |
+
outdir: output directory to store mined data for each pair.
|
20 |
+
One directory is created for each pair.
|
21 |
+
LANGS: list of languages in the corpus (other than English).
|
22 |
+
The language codes must correspond to the ones used in the
|
23 |
+
files and directories in indir. Preferably, sort the languages
|
24 |
+
in this list in alphabetic order. outdir will contain data for xx-yy,
|
25 |
+
but not for yy-xx, so it will be convenient to have this list in sorted order.
|
26 |
+
"""
|
27 |
+
|
28 |
+
for i in tqdm(range(len(LANGS) - 1)):
|
29 |
+
print()
|
30 |
+
for j in range(i + 1, len(LANGS)):
|
31 |
+
lang1 = LANGS[i]
|
32 |
+
lang2 = LANGS[j]
|
33 |
+
# print()
|
34 |
+
print("{} {}".format(lang1, lang2))
|
35 |
+
|
36 |
+
fname1 = "{}/en-{}/train.en".format(indir, lang1)
|
37 |
+
fname2 = "{}/en-{}/train.en".format(indir, lang2)
|
38 |
+
# print(fname1)
|
39 |
+
# print(fname2)
|
40 |
+
enset_l1 = set(read_file(fname1))
|
41 |
+
common_en_set = enset_l1.intersection(read_file(fname2))
|
42 |
+
|
43 |
+
## this block should be used if you want to consider multiple translations.
|
44 |
+
# il_fname1 = "{}/en-{}/train.{}".format(indir, lang1, lang1)
|
45 |
+
# en_lang1_dict = defaultdict(list)
|
46 |
+
# for en_line, il_line in zip(read_file(fname1), read_file(il_fname1)):
|
47 |
+
# if en_line in common_en_set:
|
48 |
+
# en_lang1_dict[en_line].append(il_line)
|
49 |
+
|
50 |
+
# # this block should be used if you DONT to consider multiple translation.
|
51 |
+
il_fname1='{}/en-{}/train.{}'.format(indir,lang1,lang1)
|
52 |
+
en_lang1_dict={}
|
53 |
+
for en_line,il_line in zip(read_file(fname1),read_file(il_fname1)):
|
54 |
+
if en_line in common_en_set:
|
55 |
+
en_lang1_dict[en_line]=il_line
|
56 |
+
|
57 |
+
os.makedirs("{}/{}-{}".format(outdir, lang1, lang2), exist_ok=True)
|
58 |
+
out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
|
59 |
+
o=outdir, l1=lang1, l2=lang2
|
60 |
+
)
|
61 |
+
out_l2_fname = "{o}/{l1}-{l2}/train.{l2}".format(
|
62 |
+
o=outdir, l1=lang1, l2=lang2
|
63 |
+
)
|
64 |
+
|
65 |
+
il_fname2 = "{}/en-{}/train.{}".format(indir, lang2, lang2)
|
66 |
+
with open(out_l1_fname, "w", encoding="utf-8") as out_l1_file, open(
|
67 |
+
out_l2_fname, "w", encoding="utf-8"
|
68 |
+
) as out_l2_file:
|
69 |
+
for en_line, il_line in zip(read_file(fname2), read_file(il_fname2)):
|
70 |
+
if en_line in en_lang1_dict:
|
71 |
+
|
72 |
+
# this block should be used if you want to consider multiple translations.
|
73 |
+
# for il_line_lang1 in en_lang1_dict[en_line]:
|
74 |
+
# lang1_line, lang2_line = il_line_lang1, il_line
|
75 |
+
# out_l1_file.write(lang1_line + "\n")
|
76 |
+
# out_l2_file.write(lang2_line + "\n")
|
77 |
+
|
78 |
+
# this block should be used if you DON'T want to consider multiple translations.
|
79 |
+
lang1_line, lang2_line = en_lang1_dict[en_line], il_line
|
80 |
+
out_l1_file.write(lang1_line+'\n')
|
81 |
+
out_l2_file.write(lang2_line+'\n')
|
82 |
+
|
83 |
+
|
84 |
+
def get_extracted_stats(outdir, LANGS):
|
85 |
+
"""
|
86 |
+
gathers stats from the extracted directories
|
87 |
+
|
88 |
+
outdir: output directory to store mined data for each pair.
|
89 |
+
One directory is created for each pair.
|
90 |
+
LANGS: list of languages in the corpus (other than English).
|
91 |
+
The language codes must correspond to the ones used in the
|
92 |
+
files and directories in indir. Preferably, sort the languages
|
93 |
+
in this list in alphabetic order. outdir will contain data for xx-yy,
|
94 |
+
"""
|
95 |
+
common_stats = []
|
96 |
+
for i in tqdm(range(len(LANGS) - 1)):
|
97 |
+
for j in range(i + 1, len(LANGS)):
|
98 |
+
lang1 = LANGS[i]
|
99 |
+
lang2 = LANGS[j]
|
100 |
+
|
101 |
+
out_l1_fname = "{o}/{l1}-{l2}/train.{l1}".format(
|
102 |
+
o=outdir, l1=lang1, l2=lang2
|
103 |
+
)
|
104 |
+
|
105 |
+
cnt = sum([1 for _ in read_file(out_l1_fname)])
|
106 |
+
common_stats.append((lang1, lang2, cnt))
|
107 |
+
common_stats.append((lang2, lang1, cnt))
|
108 |
+
return common_stats
|
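A usage sketch (paths are hypothetical): given English-centric folders `en-xx`, `extract_non_english_pairs` mines xx-yy corpora by pivoting on English sentences shared between the two `train.en` files.

```python
from scripts.extract_non_english_pairs import extract_non_english_pairs, get_extracted_stats

LANGS = ["bn", "hi", "ta"]  # sorted, so output covers bn-hi, bn-ta and hi-ta
extract_non_english_pairs("data/en-centric", "data/mined", LANGS)

for l1, l2, count in get_extracted_stats("data/mined", LANGS):
    print(l1, l2, count)
```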
scripts/postprocess_score.py
ADDED
@@ -0,0 +1,48 @@
|
1 |
+
import sys
|
2 |
+
|
3 |
+
def postprocess(
|
4 |
+
infname, outfname, input_size
|
5 |
+
):
|
6 |
+
"""
|
7 |
+
parse fairseq interactive output and extract the model score for each input sentence.
|
8 |
+
|
9 |
+
infname: fairseq log file
|
10 |
+
outfname: output file with one score per line (sentences that were not translated get a score of 0.0)
|
11 |
+
input_size: expected number of output sentences
|
12 |
+
"""
|
13 |
+
|
14 |
+
consolidated_testoutput = []
|
15 |
+
# with open(infname,'r',encoding='utf-8') as infile:
|
16 |
+
# consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),infile) ))
|
17 |
+
# consolidated_testoutput.sort(key=lambda x: int(x.split('\t')[0].split('-')[1]))
|
18 |
+
# consolidated_testoutput=[ x.split('\t')[2] for x in consolidated_testoutput ]
|
19 |
+
|
20 |
+
consolidated_testoutput = [(x, 0.0, "") for x in range(input_size)]
|
21 |
+
temp_testoutput = []
|
22 |
+
with open(infname, "r", encoding="utf-8") as infile:
|
23 |
+
temp_testoutput = list(
|
24 |
+
map(
|
25 |
+
lambda x: x.strip().split("\t"),
|
26 |
+
filter(lambda x: x.startswith("H-"), infile),
|
27 |
+
)
|
28 |
+
)
|
29 |
+
temp_testoutput = list(
|
30 |
+
map(lambda x: (int(x[0].split("-")[1]), float(x[1]), x[2]), temp_testoutput)
|
31 |
+
)
|
32 |
+
for sid, score, hyp in temp_testoutput:
|
33 |
+
consolidated_testoutput[sid] = (sid, score, hyp)
|
34 |
+
#consolidated_testoutput = [x[2] for x in consolidated_testoutput]
|
35 |
+
|
36 |
+
with open(outfname, "w", encoding="utf-8") as outfile:
|
37 |
+
for (sid, score, hyp) in consolidated_testoutput:
|
38 |
+
outfile.write("{}\n".format(score))
|
39 |
+
|
40 |
+
if __name__ == "__main__":
|
41 |
+
|
42 |
+
infname = sys.argv[1]
|
43 |
+
outfname = sys.argv[2]
|
44 |
+
input_size = int(sys.argv[3])
|
45 |
+
|
46 |
+
postprocess(
|
47 |
+
infname, outfname, input_size
|
48 |
+
)
|
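Both this script and `postprocess_translate.py` below consume fairseq-interactive logs, where each hypothesis appears as `H-<sentence_id>\t<score>\t<hypothesis>`. A toy parse of that format, matching the logic above:

```python
log_lines = [
    "S-0\tsource sentence",
    "H-0\t-0.41\ttranslated sentence",
    "H-1\t-0.88\tanother translation",
]

fields = [l.split("\t") for l in log_lines if l.startswith("H-")]
hyps = [(int(f[0].split("-")[1]), float(f[1]), f[2]) for f in fields]
print(hyps)  # [(0, -0.41, 'translated sentence'), (1, -0.88, 'another translation')]
```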
scripts/postprocess_translate.py
ADDED
@@ -0,0 +1,110 @@
|
1 |
+
INDIC_NLP_LIB_HOME = "indic_nlp_library"
|
2 |
+
INDIC_NLP_RESOURCES = "indic_nlp_resources"
|
3 |
+
import sys
|
4 |
+
|
5 |
+
from indicnlp import transliterate
|
6 |
+
|
7 |
+
sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
|
8 |
+
from indicnlp import common
|
9 |
+
|
10 |
+
common.set_resources_path(INDIC_NLP_RESOURCES)
|
11 |
+
from indicnlp import loader
|
12 |
+
|
13 |
+
loader.load()
|
14 |
+
from sacremoses import MosesPunctNormalizer
|
15 |
+
from sacremoses import MosesTokenizer
|
16 |
+
from sacremoses import MosesDetokenizer
|
17 |
+
from collections import defaultdict
|
18 |
+
|
19 |
+
import indicnlp
|
20 |
+
from indicnlp.tokenize import indic_tokenize
|
21 |
+
from indicnlp.tokenize import indic_detokenize
|
22 |
+
from indicnlp.normalize import indic_normalize
|
23 |
+
from indicnlp.transliterate import unicode_transliterate
|
24 |
+
|
25 |
+
|
26 |
+
def postprocess(
|
27 |
+
infname, outfname, input_size, lang, common_lang="hi", transliterate=False
|
28 |
+
):
|
29 |
+
"""
|
30 |
+
parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.
|
31 |
+
|
32 |
+
infname: fairseq log file
|
33 |
+
outfname: output file of translations (sentences that were not translated are written as empty lines)
|
34 |
+
input_size: expected number of output sentences
|
35 |
+
lang: language
|
36 |
+
"""
|
37 |
+
|
38 |
+
consolidated_testoutput = []
|
39 |
+
# with open(infname,'r',encoding='utf-8') as infile:
|
40 |
+
# consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),infile) ))
|
41 |
+
# consolidated_testoutput.sort(key=lambda x: int(x.split('\t')[0].split('-')[1]))
|
42 |
+
# consolidated_testoutput=[ x.split('\t')[2] for x in consolidated_testoutput ]
|
43 |
+
|
44 |
+
consolidated_testoutput = [(x, 0.0, "") for x in range(input_size)]
|
45 |
+
temp_testoutput = []
|
46 |
+
with open(infname, "r", encoding="utf-8") as infile:
|
47 |
+
temp_testoutput = list(
|
48 |
+
map(
|
49 |
+
lambda x: x.strip().split("\t"),
|
50 |
+
filter(lambda x: x.startswith("H-"), infile),
|
51 |
+
)
|
52 |
+
)
|
53 |
+
temp_testoutput = list(
|
54 |
+
map(lambda x: (int(x[0].split("-")[1]), float(x[1]), x[2]), temp_testoutput)
|
55 |
+
)
|
56 |
+
for sid, score, hyp in temp_testoutput:
|
57 |
+
consolidated_testoutput[sid] = (sid, score, hyp)
|
58 |
+
consolidated_testoutput = [x[2] for x in consolidated_testoutput]
|
59 |
+
|
60 |
+
if lang == "en":
|
61 |
+
en_detok = MosesDetokenizer(lang="en")
|
62 |
+
with open(outfname, "w", encoding="utf-8") as outfile:
|
63 |
+
for sent in consolidated_testoutput:
|
64 |
+
outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
|
65 |
+
else:
|
66 |
+
xliterator = unicode_transliterate.UnicodeIndicTransliterator()
|
67 |
+
with open(outfname, "w", encoding="utf-8") as outfile:
|
68 |
+
for sent in consolidated_testoutput:
|
69 |
+
if transliterate:
|
70 |
+
outstr = indic_detokenize.trivial_detokenize(
|
71 |
+
xliterator.transliterate(sent, common_lang, lang), lang
|
72 |
+
)
|
73 |
+
else:
|
74 |
+
outstr = indic_detokenize.trivial_detokenize(sent, lang)
|
75 |
+
outfile.write(outstr + "\n")
|
76 |
+
|
77 |
+
|
78 |
+
if __name__ == "__main__":
|
79 |
+
# # The path to the local git repo for Indic NLP library
|
80 |
+
# INDIC_NLP_LIB_HOME="indic_nlp_library"
|
81 |
+
# INDIC_NLP_RESOURCES = "indic_nlp_resources"
|
82 |
+
# sys.path.append('{}'.format(INDIC_NLP_LIB_HOME))
|
83 |
+
# common.set_resources_path(INDIC_NLP_RESOURCES)
|
84 |
+
# # The path to the local git repo for Indic NLP Resources
|
85 |
+
# INDIC_NLP_RESOURCES=""
|
86 |
+
|
87 |
+
# sys.path.append('{}'.format(INDIC_NLP_LIB_HOME))
|
88 |
+
# common.set_resources_path(INDIC_NLP_RESOURCES)
|
89 |
+
|
90 |
+
# loader.load()
|
91 |
+
|
92 |
+
infname = sys.argv[1]
|
93 |
+
outfname = sys.argv[2]
|
94 |
+
input_size = int(sys.argv[3])
|
95 |
+
lang = sys.argv[4]
|
96 |
+
if len(sys.argv) == 5:
|
97 |
+
transliterate = False
|
98 |
+
elif len(sys.argv) == 6:
|
99 |
+
transliterate = sys.argv[5]
|
100 |
+
if transliterate.lower() == "true":
|
101 |
+
transliterate = True
|
102 |
+
else:
|
103 |
+
transliterate = False
|
104 |
+
else:
|
105 |
+
print(f"Invalid arguments: {sys.argv}")
|
106 |
+
exit()
|
107 |
+
|
108 |
+
postprocess(
|
109 |
+
infname, outfname, input_size, lang, common_lang="hi", transliterate=transliterate
|
110 |
+
)
|
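
The parsing above relies on fairseq's generation log format, where each hypothesis line starts with "H-<sentence-id>" followed by a model score and the tokenized hypothesis, tab-separated. A minimal, self-contained sketch of that assumption (the log lines below are invented for illustration):

# Illustrative only: two fairseq-style hypothesis lines.
log_lines = [
    "H-1\t-0.41\tयह एक परीक्षण वाक्य है",
    "H-0\t-0.27\tमैं घर जा रहा हूँ",
]

# Same idea as postprocess(): keep H- lines, split on tabs, and index the
# hypotheses by sentence id so the output order matches the input order.
parsed = {}
for line in log_lines:
    sid, score, hyp = line.strip().split("\t")
    parsed[int(sid.split("-")[1])] = (float(score), hyp)

for sid in sorted(parsed):
    print(sid, parsed[sid])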
scripts/preprocess_translate.py
ADDED
@@ -0,0 +1,172 @@
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"
import sys

sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
from indicnlp import common

common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader

loader.load()
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer
from collections import defaultdict

from tqdm import tqdm
from joblib import Parallel, delayed

from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate


en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()


def preprocess_line(line, normalizer, lang, transliterate=False):
    if lang == "en":
        return " ".join(
            en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)
        )
    elif transliterate:
        # line = indic_detokenize.trivial_detokenize(line.strip(), lang)
        return unicode_transliterate.UnicodeIndicTransliterator.transliterate(
            " ".join(
                indic_tokenize.trivial_tokenize(
                    normalizer.normalize(line.strip()), lang
                )
            ),
            lang,
            "hi",
        ).replace(" ् ", "्")
    else:
        # we only need to transliterate for joint training
        return " ".join(
            indic_tokenize.trivial_tokenize(normalizer.normalize(line.strip()), lang)
        )


def preprocess(infname, outfname, lang, transliterate=False):
    """
    Normalize, tokenize and convert the script (for Indic languages).
    Returns the number of sentences in the input file.
    """

    n = 0
    num_lines = sum(1 for line in open(infname, "r"))
    if lang == "en":
        with open(infname, "r", encoding="utf-8") as infile, open(
            outfname, "w", encoding="utf-8"
        ) as outfile:

            out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
                delayed(preprocess_line)(line, None, lang)
                for line in tqdm(infile, total=num_lines)
            )

            for line in out_lines:
                outfile.write(line + "\n")
                n += 1

    else:
        normfactory = indic_normalize.IndicNormalizerFactory()
        normalizer = normfactory.get_normalizer(lang)
        # reading
        with open(infname, "r", encoding="utf-8") as infile, open(
            outfname, "w", encoding="utf-8"
        ) as outfile:

            out_lines = Parallel(n_jobs=-1, backend="multiprocessing")(
                delayed(preprocess_line)(line, normalizer, lang, transliterate)
                for line in tqdm(infile, total=num_lines)
            )

            for line in out_lines:
                outfile.write(line + "\n")
                n += 1
    return n


def old_preprocess(infname, outfname, lang):
    """
    Preparing each corpus file:
      - Normalization
      - Tokenization
      - Script conversion to Devanagari for Indic scripts
    """
    n = 0
    num_lines = sum(1 for line in open(infname, "r"))
    # reading
    with open(infname, "r", encoding="utf-8") as infile, open(
        outfname, "w", encoding="utf-8"
    ) as outfile:

        if lang == "en":
            en_tok = MosesTokenizer(lang="en")
            en_normalizer = MosesPunctNormalizer()
            for line in tqdm(infile, total=num_lines):
                outline = " ".join(
                    en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)
                )
                outfile.write(outline + "\n")
                n += 1

        else:
            normfactory = indic_normalize.IndicNormalizerFactory()
            normalizer = normfactory.get_normalizer(lang)
            for line in tqdm(infile, total=num_lines):
                outline = (
                    unicode_transliterate.UnicodeIndicTransliterator.transliterate(
                        " ".join(
                            indic_tokenize.trivial_tokenize(
                                normalizer.normalize(line.strip()), lang
                            )
                        ),
                        lang,
                        "hi",
                    ).replace(" ् ", "्")
                )

                outfile.write(outline + "\n")
                n += 1
    return n


if __name__ == "__main__":

    # INDIC_NLP_LIB_HOME = "indic_nlp_library"
    # INDIC_NLP_RESOURCES = "indic_nlp_resources"
    # sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))
    # common.set_resources_path(INDIC_NLP_RESOURCES)

    # data_dir = '../joint_training/v1'
    # new_dir = data_dir + '.norm'
    # for path, subdirs, files in os.walk(data_dir):
    #     for name in files:
    #         infile = os.path.join(path, name)
    #         lang = infile.split('.')[-1]
    #         outfile = os.path.join(path.replace(data_dir, new_dir), name)
    #         preprocess(infile, outfile, lang)
    # loader.load()

    infname = sys.argv[1]
    outfname = sys.argv[2]
    lang = sys.argv[3]

    if len(sys.argv) == 4:
        transliterate = False
    elif len(sys.argv) == 5:
        transliterate = sys.argv[4]
        if transliterate.lower() == "true":
            transliterate = True
        else:
            transliterate = False
    else:
        print(f"Invalid arguments: {sys.argv}")
        exit()
    print(preprocess(infname, outfname, lang, transliterate))
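
As a quick illustration of what preprocess_line() produces, here is a small sketch of the English branch only; it needs sacremoses but not the Indic NLP resources, and the sample sentence is invented:

# Minimal sketch of the lang == "en" path of preprocess_line().
from sacremoses import MosesPunctNormalizer, MosesTokenizer

en_tok = MosesTokenizer(lang="en")
en_normalizer = MosesPunctNormalizer()

line = "The quick brown fox -- it jumped over the lazy dog!"
# Normalize punctuation, then tokenize without escaping special characters,
# exactly as the English branch above does.
print(" ".join(en_tok.tokenize(en_normalizer.normalize(line.strip()), escape=False)))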
scripts/remove_large_sentences.py
ADDED
@@ -0,0 +1,44 @@
from tqdm import tqdm
import sys


def remove_large_sentences(src_path, tgt_path):
    count = 0
    new_src_lines = []
    new_tgt_lines = []
    src_num_lines = sum(1 for line in open(src_path, "r", encoding="utf-8"))
    tgt_num_lines = sum(1 for line in open(tgt_path, "r", encoding="utf-8"))
    assert src_num_lines == tgt_num_lines
    with open(src_path, encoding="utf-8") as f1, open(tgt_path, encoding="utf-8") as f2:
        for src_line, tgt_line in tqdm(zip(f1, f2), total=src_num_lines):
            src_tokens = src_line.strip().split(" ")
            tgt_tokens = tgt_line.strip().split(" ")
            if len(src_tokens) > 200 or len(tgt_tokens) > 200:
                count += 1
                continue
            new_src_lines.append(src_line)
            new_tgt_lines.append(tgt_line)
    return count, new_src_lines, new_tgt_lines


def create_txt(outFile, lines, add_newline=False):
    outfile = open("{0}".format(outFile), "w", encoding="utf-8")
    for line in lines:
        if add_newline:
            outfile.write(line + "\n")
        else:
            outfile.write(line)
    outfile.close()


if __name__ == "__main__":

    src_path = sys.argv[1]
    tgt_path = sys.argv[2]
    new_src_path = sys.argv[3]
    new_tgt_path = sys.argv[4]

    count, new_src_lines, new_tgt_lines = remove_large_sentences(src_path, tgt_path)
    print(f'{count} lines removed due to seq_len > 200')
    create_txt(new_src_path, new_src_lines)
    create_txt(new_tgt_path, new_tgt_lines)
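
A small sketch of the same length filter applied to in-memory lists rather than files; the sentence pairs are invented, and the 200-token cutoff matches the script above:

# Illustrative only: one short pair is kept, one overly long pair is dropped.
src = ["a short sentence", "tok " * 300]
tgt = ["एक छोटा वाक्य", "तोक " * 300]

kept = [
    (s, t)
    for s, t in zip(src, tgt)
    if len(s.strip().split(" ")) <= 200 and len(t.strip().split(" ")) <= 200
]
print(f"kept {len(kept)} of {len(src)} pairs")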
scripts/remove_train_devtest_overlaps.py
ADDED
@@ -0,0 +1,265 @@
import os
import string
import shutil
from itertools import permutations, chain
from collections import defaultdict
from tqdm import tqdm
import sys

INDIC_LANGS = ["as", "bn", "gu", "hi", "kn", "ml", "mr", "or", "pa", "ta", "te"]
# we will be testing the overlaps of training data with all these benchmarks
# benchmarks = ['wat2021-devtest', 'wat2020-devtest', 'wat-2018', 'wmt-news', 'ufal-ta', 'pmi']


def read_lines(path):
    # if the path doesn't exist, return an empty list
    if not os.path.exists(path):
        return []
    with open(path, "r") as f:
        lines = f.readlines()
    return lines


def create_txt(outFile, lines):
    add_newline = not "\n" in lines[0]
    outfile = open("{0}".format(outFile), "w")
    for line in lines:
        if add_newline:
            outfile.write(line + "\n")
        else:
            outfile.write(line)

    outfile.close()


def pair_dedup_files(src_file, tgt_file):
    src_lines = read_lines(src_file)
    tgt_lines = read_lines(tgt_file)
    len_before = len(src_lines)

    src_dedupped, tgt_dedupped = pair_dedup_lists(src_lines, tgt_lines)

    len_after = len(src_dedupped)
    num_duplicates = len_before - len_after

    print(f"Dropped duplicate pairs in {src_file}. Num duplicates -> {num_duplicates}")
    create_txt(src_file, src_dedupped)
    create_txt(tgt_file, tgt_dedupped)


def pair_dedup_lists(src_list, tgt_list):
    src_tgt = list(set(zip(src_list, tgt_list)))
    src_deduped, tgt_deduped = zip(*src_tgt)
    return src_deduped, tgt_deduped


def strip_and_normalize(line):
    # lowercase the line, remove spaces and strip punctuation

    # one of the fastest ways to add an exclusion list and remove that
    # list of characters from a string
    # https://towardsdatascience.com/how-to-efficiently-remove-punctuations-from-a-string-899ad4a059fb
    exclist = string.punctuation + "\u0964"
    table_ = str.maketrans("", "", exclist)

    line = line.replace(" ", "").lower()
    # don't use this method, it is painfully slow
    # line = "".join([i for i in line if i not in string.punctuation])
    line = line.translate(table_)
    return line


def expand_tupled_list(list_of_tuples):
    # convert a list of tuples into two lists
    # https://stackoverflow.com/questions/8081545/how-to-convert-list-of-tuples-to-multiple-lists
    # [(en, as), (as, bn), (bn, gu)] -> [en, as, bn], [as, bn, gu]
    list_a, list_b = map(list, zip(*list_of_tuples))
    return list_a, list_b


def get_src_tgt_lang_lists(many2many=False):
    if many2many is False:
        SRC_LANGS = ["en"]
        TGT_LANGS = INDIC_LANGS
    else:
        all_languages = INDIC_LANGS + ["en"]
        # lang_pairs = list(permutations(all_languages, 2))

        SRC_LANGS, TGT_LANGS = all_languages, all_languages

    return SRC_LANGS, TGT_LANGS


def normalize_and_gather_all_benchmarks(devtest_dir, many2many=False):

    # This is a dict of dicts of lists:
    # the first keys are lang pairs, the second keys are src/tgt,
    # and the values are the devtest lines, e.g.
    # devtest_pairs_normalized[en-as][src] stores the src (en) lines and
    # devtest_pairs_normalized[en-as][tgt] stores the tgt (as) lines
    devtest_pairs_normalized = defaultdict(lambda: defaultdict(list))
    SRC_LANGS, TGT_LANGS = get_src_tgt_lang_lists(many2many)
    benchmarks = os.listdir(devtest_dir)
    for dataset in benchmarks:
        for src_lang in SRC_LANGS:
            for tgt_lang in TGT_LANGS:
                if src_lang == tgt_lang:
                    continue
                if dataset == "wat2021-devtest":
                    # the wat2021 dev and test sets have a different folder structure
                    src_dev = read_lines(f"{devtest_dir}/{dataset}/dev.{src_lang}")
                    tgt_dev = read_lines(f"{devtest_dir}/{dataset}/dev.{tgt_lang}")
                    src_test = read_lines(f"{devtest_dir}/{dataset}/test.{src_lang}")
                    tgt_test = read_lines(f"{devtest_dir}/{dataset}/test.{tgt_lang}")
                else:
                    src_dev = read_lines(
                        f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/dev.{src_lang}"
                    )
                    tgt_dev = read_lines(
                        f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/dev.{tgt_lang}"
                    )
                    src_test = read_lines(
                        f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/test.{src_lang}"
                    )
                    tgt_test = read_lines(
                        f"{devtest_dir}/{dataset}/{src_lang}-{tgt_lang}/test.{tgt_lang}"
                    )

                # if the tgt_pair data doesn't exist for a particular test set,
                # it will be an empty list
                if tgt_test == [] or tgt_dev == []:
                    # print(f'{dataset} does not have {src_lang}-{tgt_lang} data')
                    continue

                # combine both dev and test sets into one
                src_devtest = src_dev + src_test
                tgt_devtest = tgt_dev + tgt_test

                src_devtest = [strip_and_normalize(line) for line in src_devtest]
                tgt_devtest = [strip_and_normalize(line) for line in tgt_devtest]

                devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["src"].extend(
                    src_devtest
                )
                devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["tgt"].extend(
                    tgt_devtest
                )

    # dedup merged benchmark datasets
    for src_lang in SRC_LANGS:
        for tgt_lang in TGT_LANGS:
            if src_lang == tgt_lang:
                continue
            src_devtest, tgt_devtest = (
                devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["src"],
                devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["tgt"],
            )
            # if the devtest data doesn't exist for the src-tgt pair then continue
            if src_devtest == [] or tgt_devtest == []:
                continue
            src_devtest, tgt_devtest = pair_dedup_lists(src_devtest, tgt_devtest)
            (
                devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["src"],
                devtest_pairs_normalized[f"{src_lang}-{tgt_lang}"]["tgt"],
            ) = (
                src_devtest,
                tgt_devtest,
            )

    return devtest_pairs_normalized


def remove_train_devtest_overlaps(train_dir, devtest_dir, many2many=False):

    devtest_pairs_normalized = normalize_and_gather_all_benchmarks(
        devtest_dir, many2many
    )

    SRC_LANGS, TGT_LANGS = get_src_tgt_lang_lists(many2many)

    if not many2many:
        all_src_sentences_normalized = []
        for key in devtest_pairs_normalized:
            all_src_sentences_normalized.extend(devtest_pairs_normalized[key]["src"])
        # remove all duplicates. Now this contains all the normalized
        # english sentences in all test benchmarks across all lang pairs
        all_src_sentences_normalized = list(set(all_src_sentences_normalized))
    else:
        all_src_sentences_normalized = None

    src_overlaps = []
    tgt_overlaps = []
    for src_lang in SRC_LANGS:
        for tgt_lang in TGT_LANGS:
            if src_lang == tgt_lang:
                continue
            new_src_train = []
            new_tgt_train = []

            pair = f"{src_lang}-{tgt_lang}"
            src_train = read_lines(f"{train_dir}/{pair}/train.{src_lang}")
            tgt_train = read_lines(f"{train_dir}/{pair}/train.{tgt_lang}")

            len_before = len(src_train)
            if len_before == 0:
                continue

            src_train_normalized = [strip_and_normalize(line) for line in src_train]
            tgt_train_normalized = [strip_and_normalize(line) for line in tgt_train]

            if all_src_sentences_normalized:
                src_devtest_normalized = all_src_sentences_normalized
            else:
                src_devtest_normalized = devtest_pairs_normalized[pair]["src"]

            tgt_devtest_normalized = devtest_pairs_normalized[pair]["tgt"]

            # compute all src and tgt super strict overlaps for a lang pair
            overlaps = set(src_train_normalized) & set(src_devtest_normalized)
            src_overlaps.extend(list(overlaps))

            overlaps = set(tgt_train_normalized) & set(tgt_devtest_normalized)
            tgt_overlaps.extend(list(overlaps))
            # dictionaries offer O(1) lookup
            src_overlaps_dict = {}
            tgt_overlaps_dict = {}
            for line in src_overlaps:
                src_overlaps_dict[line] = 1
            for line in tgt_overlaps:
                tgt_overlaps_dict[line] = 1

            # loop to remove the overlapped data
            idx = -1
            for src_line_norm, tgt_line_norm in tqdm(
                zip(src_train_normalized, tgt_train_normalized), total=len_before
            ):
                idx += 1
                if src_overlaps_dict.get(src_line_norm, None):
                    continue
                if tgt_overlaps_dict.get(tgt_line_norm, None):
                    continue
                new_src_train.append(src_train[idx])
                new_tgt_train.append(tgt_train[idx])

            len_after = len(new_src_train)
            print(
                f"Detected overlaps between train and devtest for {pair}: {len_before - len_after}"
            )
            print(f"saving new files at {train_dir}/{pair}/")
            create_txt(f"{train_dir}/{pair}/train.{src_lang}", new_src_train)
            create_txt(f"{train_dir}/{pair}/train.{tgt_lang}", new_tgt_train)


if __name__ == "__main__":
    train_data_dir = sys.argv[1]
    # the benchmarks directory should contain all the test sets
    devtest_data_dir = sys.argv[2]
    if len(sys.argv) == 3:
        many2many = False
    elif len(sys.argv) == 4:
        # the optional many2many flag is the third argument (sys.argv[3])
        many2many = sys.argv[3]
        if many2many.lower() == "true":
            many2many = True
        else:
            many2many = False
    remove_train_devtest_overlaps(train_data_dir, devtest_data_dir, many2many)
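
The overlap check above treats two sentences as equal only after strip_and_normalize(), i.e. after lowercasing, removing all spaces and stripping punctuation (including the Devanagari danda). A self-contained sketch of that comparison, with invented sentences:

import string

def strip_and_normalize(line):
    # lowercase, drop spaces, strip punctuation (incl. the Devanagari danda U+0964)
    exclist = string.punctuation + "\u0964"
    table_ = str.maketrans("", "", exclist)
    return line.replace(" ", "").lower().translate(table_)

# Both invented sentences normalize to "helloworld", so a training pair whose
# source matched the first while a devtest source matched the second would be
# counted as an overlap and dropped.
print(strip_and_normalize("Hello, World!") == strip_and_normalize("hello world"))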