# NMTKD/translation/tools/mosesdecoder/contrib/arrow-pipelines/pcl/components/wrappers/cleanup/cleanup.py
def get_name():
    """Return the name of this component, the string 'cleanup'."""
    return 'cleanup'
def get_inputs():
    """Return the names of the component's inputs.

    These are the keys that the function built by ``initialise`` reads from
    its input dictionary: the source-side and target-side corpus filenames.
    """
    return ['src_filename', 'trg_filename']
def get_outputs():
    """Return the names of the component's outputs.

    These are the keys of the dictionary returned by the function built by
    ``initialise``: the filenames of the cleaned source and target corpora.
    """
    return ['cleaned_src_filename', 'cleaned_trg_filename']
def get_configuration():
    """Return the names of the configuration values this component uses.

    ``configure`` looks up exactly one key, 'segment_length_limit'.
    """
    return ['segment_length_limit']
def configure(args):
    """Translate external configuration into the component's own config.

    ``args`` must contain 'segment_length_limit'; its value is passed through
    unchanged under the key 'segment_length', which is the key ``initialise``
    reads.

    Raises KeyError if 'segment_length_limit' is missing from ``args``.
    """
    return {'segment_length': args['segment_length_limit']}
def initialise(config):
    """Build the function that cleans a pair of parallel corpus files.

    ``config`` is a dictionary holding 'segment_length' (as produced by
    ``configure``); it is read each time the returned function runs.

    The returned function takes ``(a, s)``.  ``a`` is a dictionary with
    'src_filename' and 'trg_filename'; ``s`` is accepted but not used here.
    It writes the kept line pairs to two new files and returns a dictionary
    with 'cleaned_src_filename' and 'cleaned_trg_filename'.
    """
    import os

    try:
        # Python 2: the builtin zip() would read both corpora fully into
        # memory before the first pair is examined.
        from itertools import izip as _lazy_zip
    except ImportError:
        # Python 3: zip() is already lazy.
        _lazy_zip = zip

    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        """Copy line pairs to the outputs when both lines are short enough.

        A line is kept when it contains fewer than ``limit`` space
        characters.  Pairing stops at the end of the shorter input; any
        surplus lines in the longer input are ignored.
        """
        def _short(line):
            return line.count(" ") < limit

        for l1, l2 in _lazy_zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                # Lines still carry their own newline, so write them as-is.
                ofh1.write(l1)
                ofh2.write(l2)

    def _make_cleaned_filename(filename):
        """Insert 'clean' before the extension of ``filename``.

        Only the final path component is examined, so dots in directory
        names are left alone.  'dir/a.src' becomes 'dir/a.clean.src'; a name
        with no dot gets '.clean' appended.
        """
        basename = os.path.basename(filename)
        # Keep the directory prefix byte-for-byte rather than re-joining it.
        prefix = filename[:len(filename) - len(basename)]
        stem, dot, extension = basename.rpartition(".")
        if dot:
            return prefix + stem + ".clean." + extension
        return filename + ".clean"

    def _filter_main(a, s):
        limit = config['segment_length']
        input_src_filename = a['src_filename']
        input_trg_filename = a['trg_filename']

        # Parenthesised single-argument form prints identically on Python 2
        # and Python 3.
        print("Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename))

        cleaned_src_filename = _make_cleaned_filename(input_src_filename)
        cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)

        # Inputs are opened before outputs so that a missing input does not
        # leave empty output files behind.  The with-blocks close every
        # handle that was opened, even if a later open or the filter fails.
        with open(input_src_filename, "r") as ifh1, \
                open(input_trg_filename, "r") as ifh2:
            with open(cleaned_src_filename, "w") as ofh1, \
                    open(cleaned_trg_filename, "w") as ofh2:
                _filter(limit, ifh1, ofh1, ifh2, ofh2)

        return {'cleaned_src_filename': cleaned_src_filename,
                'cleaned_trg_filename': cleaned_trg_filename}

    return _filter_main
if __name__ == '__main__':
    # Self-test: build a small pair of corpora, run the component on them
    # and compare the cleaned files with hand-written expectations.
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline

    def _test_main():
        """Create the temporary inputs, run the test, then remove the files."""
        configuration = {'segment_length_limit': 20}

        # mkstemp returns an open descriptor as well as the path; only the
        # path is used below, so close the descriptor straight away.
        src_fd, src_path = tempfile.mkstemp(suffix=".src", dir="/tmp")
        os.close(src_fd)
        trg_fd, trg_path = tempfile.mkstemp(suffix=".trg", dir="/tmp")
        os.close(trg_fd)

        box_eval = {
            'src_filename': src_path,
            'trg_filename': trg_path,
            'cleaned_src_file_expected': src_path + ".expected",
            'cleaned_trg_file_expected': trg_path + ".expected",
        }
        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)

    def _run_test(configuration, box_eval):
        """Run the component through eval_pipeline and diff its outputs."""
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        try:
            thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
            thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
        finally:
            os.unlink(output['cleaned_src_filename'])
            os.unlink(output['cleaned_trg_filename'])

    def _line(line_lengths):
        """Return one synthetic line per entry, with that many tokens.

        A line of length 3 is "tok0 tok1 tok2".
        """
        def _gen_line(tokens):
            return " ".join("tok" + str(n) for n in range(tokens))

        return [_gen_line(length) for length in line_lengths]

    def _prep_files(box_eval):
        """Write the input corpora and the expected cleaned corpora.

        With a limit of 20 only the fifth pair (17 and 20 tokens) has both
        sides short enough, so it is the only pair expected in the output.
        """
        # NOTE(review): thelp.cat presumably writes the given lines to the
        # named file, one per line - confirm against test.test.
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))

    def _cleanup_files(box_eval):
        """Remove every file named in ``box_eval``, ignoring missing ones."""
        for filename in box_eval.values():
            # Handle each file separately so that one failed removal (for
            # example a file that was never created) does not leave the
            # remaining files behind.
            try:
                os.unlink(filename)
            except OSError:
                pass

    _test_main()