sakharamg's picture
Uploading all files
158b61b
def get_name():
return 'cleanup'
def get_inputs():
return ['src_filename', 'trg_filename']
def get_outputs():
return ['cleaned_src_filename', 'cleaned_trg_filename']
def get_configuration():
return ['segment_length_limit']
def configure(args):
return {'segment_length' : args['segment_length_limit']}
def initialise(config):
def _filter(limit, ifh1, ofh1, ifh2, ofh2):
def _short(line):
n = 0
for c in line:
if c == " ":
n += 1
return n < limit
for (l1, l2) in zip(ifh1, ifh2):
if _short(l1) and _short(l2):
print >>ofh1, l1,
print >>ofh2, l2,
def _make_cleaned_filename(filename):
bits = filename.split(".")
bits.insert(-1, "clean")
return ".".join(bits)
def _filter_main(a, s):
limit = config['segment_length']
(ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
try:
input_src_filename = a['src_filename']
input_trg_filename = a['trg_filename']
print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
ifh1 = open(input_src_filename, "r")
ifh2 = open(input_trg_filename, "r")
cleaned_src_filename = _make_cleaned_filename(input_src_filename)
cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
ofh1 = open(cleaned_src_filename, "w")
ofh2 = open(cleaned_trg_filename, "w")
_filter(limit, ifh1, ofh1, ifh2, ofh2)
return {'cleaned_src_filename': cleaned_src_filename,
'cleaned_trg_filename': cleaned_trg_filename}
finally:
def _safe_close(fh):
if fh is not None:
fh.close()
_safe_close(ifh1)
_safe_close(ifh2)
_safe_close(ofh1)
_safe_close(ofh2)
return _filter_main
if __name__ == '__main__':
import os
import tempfile
import test.test as thelp
from pypeline.helpers.helpers import eval_pipeline
def _test_main():
configuration = {'segment_length_limit': 20}
src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
box_eval = {
'src_filename': src_filename[1],
'trg_filename': trg_filename[1],
'cleaned_src_file_expected': src_filename[1] + ".expected",
'cleaned_trg_file_expected': trg_filename[1] + ".expected"}
try:
_prep_files(box_eval)
_run_test(configuration, box_eval)
finally:
_cleanup_files(box_eval)
def _run_test(configuration, box_eval):
box_config = configure(configuration)
box = initialise(box_config)
output = eval_pipeline(box, box_eval, box_config)
try:
thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
finally:
os.unlink(output['cleaned_src_filename'])
os.unlink(output['cleaned_trg_filename'])
def _line(line_lengths):
def _gen_line(tokens):
return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
return map(_gen_line, line_lengths)
def _prep_files(box_eval):
thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
def _cleanup_files(box_eval):
try:
for key, filename in box_eval.items():
os.unlink(filename)
except:
pass
_test_main()