File size: 4,026 Bytes
158b61b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def get_name():
    """Return the name string of this module, ``'cleanup'``."""
    name = 'cleanup'
    return name

def get_inputs():
    """Return a new list holding the two input keys.

    These are the keys that the function built by ``initialise`` looks up
    in its first argument: the source and the target file names.
    """
    source_key, target_key = 'src_filename', 'trg_filename'
    return [source_key, target_key]

def get_outputs():
    """Return a new list holding the two output keys.

    These are the keys of the dictionary returned by the function that
    ``initialise`` builds: the cleaned source and target file names.
    """
    source_key, target_key = 'cleaned_src_filename', 'cleaned_trg_filename'
    return [source_key, target_key]

def get_configuration():
    """Return a new list with the single configuration key.

    ``'segment_length_limit'`` is the key that ``configure`` reads from
    its argument.
    """
    limit_key = 'segment_length_limit'
    return [limit_key]

def configure(args):
    """Build the configuration dictionary consumed by ``initialise``.

    ``args`` must be a mapping containing ``'segment_length_limit'``; a
    missing key raises ``KeyError``.  The value is returned unchanged
    under the key ``'segment_length'``.
    """
    segment_limit = args['segment_length_limit']
    return dict(segment_length=segment_limit)

def initialise(config):
    """Build the function that filters over-long line pairs from two files.

    ``config`` must map ``'segment_length'`` to the limit; it is looked up
    each time the returned function runs, not when ``initialise`` is called.

    The returned function takes ``(a, s)``.  ``a`` must map
    ``'src_filename'`` and ``'trg_filename'`` to readable text files;
    ``s`` is accepted but not used.  The two files are read in lockstep and
    a pair of lines is copied to the output files only when *both* lines
    contain fewer than ``limit`` space characters.  Reading stops at the end
    of the shorter file.  The result is a dictionary with the output paths
    under ``'cleaned_src_filename'`` and ``'cleaned_trg_filename'``.
    """
    # Imported locally: the module only imports ``os`` inside its
    # ``__main__`` self-test, so it is not available at module level.
    import os.path

    def _filter(limit, ifh1, ofh1, ifh2, ofh2):
        """Copy the pairs of lines that are both below the space limit."""
        def _short(line):
            return line.count(" ") < limit

        for (l1, l2) in zip(ifh1, ifh2):
            if _short(l1) and _short(l2):
                # Lines from file iteration keep their own newline, so a
                # plain write reproduces them exactly on Python 2 and 3.
                ofh1.write(l1)
                ofh2.write(l2)

    def _make_cleaned_filename(filename):
        """Insert ``.clean`` before the extension of ``filename``.

        ``corpus.src`` becomes ``corpus.clean.src``.  A name without an
        extension gets ``.clean`` appended, and dots in directory names
        are left alone.
        """
        root, extension = os.path.splitext(filename)
        return root + ".clean" + extension

    def _safe_close(fh):
        """Close ``fh`` unless it was never opened."""
        if fh is not None:
            fh.close()

    def _filter_main(a, s):
        limit = config['segment_length']
        (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
        try:
            input_src_filename = a['src_filename']
            input_trg_filename = a['trg_filename']

            # Parenthesised single argument: same output on Python 2 and 3.
            print("Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename))

            ifh1 = open(input_src_filename, "r")
            ifh2 = open(input_trg_filename, "r")

            cleaned_src_filename = _make_cleaned_filename(input_src_filename)
            cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
            ofh1 = open(cleaned_src_filename, "w")
            ofh2 = open(cleaned_trg_filename, "w")

            _filter(limit, ifh1, ofh1, ifh2, ofh2)

            return {'cleaned_src_filename': cleaned_src_filename,
                    'cleaned_trg_filename': cleaned_trg_filename}
        finally:
            # Close whatever was opened, also when an error interrupts the
            # run, so the output is flushed before callers read it.
            _safe_close(ifh1)
            _safe_close(ifh2)
            _safe_close(ofh1)
            _safe_close(ofh2)

    return _filter_main


if __name__ == '__main__':
    # Self-test, run only when the module is executed as a script: build
    # small source/target files, push them through the cleanup function
    # and compare the outputs with the expected files.
    import os
    import tempfile
    import test.test as thelp

    from pypeline.helpers.helpers import eval_pipeline


    def _test_main():
        """Create the temporary files, run the test, then remove them."""
        configuration = {'segment_length_limit': 20}

        # mkstemp returns an open descriptor as well as the path; only the
        # path is needed, so close the descriptor straight away.
        src_fd, src_path = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
        os.close(src_fd)
        trg_fd, trg_path = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
        os.close(trg_fd)

        box_eval = {
            'src_filename': src_path,
            'trg_filename': trg_path,
            'cleaned_src_file_expected': src_path + ".expected",
            'cleaned_trg_file_expected': trg_path + ".expected"}

        try:
            _prep_files(box_eval)
            _run_test(configuration, box_eval)
        finally:
            _cleanup_files(box_eval)


    def _run_test(configuration, box_eval):
        """Run the cleanup function and compare its outputs with the expected files."""
        box_config = configure(configuration)
        box = initialise(box_config)

        output = eval_pipeline(box, box_eval, box_config)
        try:
            thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
            thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
        finally:
            # The cleaned files are created by the function under test, so
            # they are removed here rather than in _cleanup_files.
            os.unlink(output['cleaned_src_filename'])
            os.unlink(output['cleaned_trg_filename'])


    def _line(line_lengths):
        """Return one line of ``tok0 tok1 ...`` per requested token count."""
        def _gen_line(tokens):
            return " ".join("tok" + str(n) for n in range(tokens))
        return [_gen_line(length) for length in line_lengths]


    def _prep_files(box_eval):
        """Write the input files and the expected output files."""
        # With a limit of 20 only the fifth pair (17 and 20 tokens, i.e.
        # 16 and 19 spaces) has both lines below the limit.
        thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
        thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
        thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
        thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))


    def _cleanup_files(box_eval):
        """Remove every file named in ``box_eval``, ignoring missing ones."""
        for filename in box_eval.values():
            try:
                os.unlink(filename)
            except OSError:
                # Best effort: a file may never have been created if the
                # preparation step failed part-way through.
                pass


    _test_main()