File size: 3,337 Bytes
26fd00c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
from contextlib import ExitStack
from itertools import zip_longest


def replace_oovs(source_in, target_in, vocabulary, source_out, target_out):
    """Replace out-of-vocabulary words in source and target text with <unk-N>.

    N is the zero-based position at which the word first occurs in the
    whitespace-tokenized source sequence; repeated occurrences of the same
    word in that sequence reuse the position of the first occurrence.

    Target tokens are rewritten only when they match a word that was
    out-of-vocabulary in the paired source sequence; every other target
    token is copied through unchanged, whether or not it is in the
    vocabulary.

    Args:
        source_in: iterable of source lines (e.g. an open text file).
        target_in: iterable of target lines paired line-by-line with
            ``source_in``, or None when there is no target side.
        vocabulary: container of in-vocabulary tokens, queried with ``in``.
        source_out: object with a ``write`` method that receives one
            rewritten line per source sequence.
        target_out: object with a ``write`` method that receives one line
            per source sequence, or None to skip target output. An empty
            line is written when a source sequence has no paired target.

    Raises:
        ValueError: if ``target_in`` yields more lines than ``source_in``.
    """

    def format_unk(pos):
        return "<unk-{}>".format(pos)

    if target_in is None:
        target_in = []

    # Membership in a list/tuple is a linear scan per token; hash it once.
    if isinstance(vocabulary, (list, tuple)):
        vocabulary = frozenset(vocabulary)

    for seq_num, (source_seq, target_seq) in enumerate(
        zip_longest(source_in, target_in), start=1
    ):
        if source_seq is None:
            # zip_longest pads the shorter input with None; a target line
            # without a source line cannot be aligned to source positions.
            raise ValueError(
                "target has more sequences than source: "
                "no source sequence for target line {}".format(seq_num)
            )

        # Maps each OOV word to the position of its first occurrence.
        word_to_pos = {}
        source_seq_out = []
        for position, token in enumerate(source_seq.strip().split()):
            if token in vocabulary:
                source_seq_out.append(token)
            else:
                oov_pos = word_to_pos.setdefault(token, position)
                source_seq_out.append(format_unk(oov_pos))
        source_out.write(" ".join(source_seq_out) + "\n")

        if target_out is not None:
            target_tokens = [] if target_seq is None else target_seq.strip().split()
            target_seq_out = [
                format_unk(word_to_pos[token]) if token in word_to_pos else token
                for token in target_tokens
            ]
            target_out.write(" ".join(target_seq_out) + "\n")


def main():
    """Parse command-line arguments and rewrite OOV words in the given files.

    Reads the vocabulary file (one entry per line), then passes the source
    file and the optional target file to ``replace_oovs``, writing the
    results to ``--source-out`` and, if given, ``--target-out``. All files
    are read and written as UTF-8.
    """
    parser = argparse.ArgumentParser(
        description="Replaces out-of-vocabulary words in both source and target "
        "sequences with tokens that indicate the position of the word "
        "in the source sequence."
    )
    parser.add_argument(
        "--source", type=str, help="text file with source sequences", required=True
    )
    parser.add_argument(
        "--target", type=str, help="text file with target sequences", default=None
    )
    parser.add_argument("--vocab", type=str, help="vocabulary file", required=True)
    parser.add_argument(
        "--source-out",
        type=str,
        help="where to write source sequences with <unk-N> entries",
        required=True,
    )
    parser.add_argument(
        "--target-out",
        type=str,
        help="where to write target sequences with <unk-N> entries",
        default=None,
    )
    args = parser.parse_args()

    with open(args.vocab, encoding="utf-8") as vocab:
        # A set keeps the per-token membership test constant-time.
        vocabulary = set(vocab.read().splitlines())

    # ExitStack closes every file that was opened, including the optional
    # target files, even if a later open() or replace_oovs() raises.
    with ExitStack() as stack:
        target_in = (
            stack.enter_context(open(args.target, "r", encoding="utf-8"))
            if args.target is not None
            else None
        )
        target_out = (
            stack.enter_context(open(args.target_out, "w", encoding="utf-8"))
            if args.target_out is not None
            else None
        )
        source_in = stack.enter_context(open(args.source, "r", encoding="utf-8"))
        source_out = stack.enter_context(
            open(args.source_out, "w", encoding="utf-8")
        )
        replace_oovs(source_in, target_in, vocabulary, source_out, target_out)


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()