# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
# Contributors: Ozan Caglayan, Wiktor Stribizew
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT

"""

This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,

https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926

which was also ported into Python in

https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162

"""


import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
    """

    This NIST tokenizer is sentence-based instead of the original

    paragraph-based tokenization from mteval-14.pl; The sentence-based

    tokenization is consistent with the other tokenizers available in NLTK.



    >>> from nltk.tokenize.nist import NISTTokenizer

    >>> nist = NISTTokenizer()

    >>> s = "Good muffins cost $3.88 in New York."

    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']

    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']

    >>> nist.tokenize(s, lowercase=False) == expected_cased

    True

    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.

    True



    The international_tokenize() is the preferred function when tokenizing

    non-european text, e.g.



    >>> from nltk.tokenize.nist import NISTTokenizer

    >>> nist = NISTTokenizer()



    # Input strings.

    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) us a Chinese e-commerce company...'

    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'

    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'



    # Expected tokens.

    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']

    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']

    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']



    >>> nist.international_tokenize(albb)[:10] == expected_albb

    True

    >>> nist.international_tokenize(amz)[:10] == expected_amz

    True

    >>> nist.international_tokenize(rkt)[:10] == expected_rkt

    True



    # Doctest for patching issue #1926

    >>> sent = u'this is a foo\u2604sentence.'

    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']

    >>> nist.international_tokenize(sent) == expected_sent

    True

    """

    # Strip "skipped" tags
    STRIP_SKIP = re.compile("<skipped>"), ""
    #  Strip end-of-line hyphenation and join lines
    STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
    # Tokenize punctuation.
    PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
    # Tokenize dash when preceded by a digit
    DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]
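    # Illustration (not part of the original script): with the regexes above,
    # NISTTokenizer().tokenize("It costs $3.88, not 5-10 dollars.") yields
    # ['It', 'costs', '$', '3.88', ',', 'not', '5', '-', '10', 'dollars', '.'].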

    # Perluniprops characters used in NIST tokenizer.
    pup_number = "".join(set(perluniprops.chars("Number")))  # i.e. \p{N}
    pup_punct = "".join(set(perluniprops.chars("Punctuation")))  # i.e. \p{P}
    pup_symbol = "".join(set(perluniprops.chars("Symbol")))  # i.e. \p{S}

    # Python regexes need to escape some special symbols; see
    # https://stackoverflow.com/q/45670950/610569
    number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
    punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
    symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
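    # e.g. a literal "]", "^", "\" or "-" in the character sets above is
    # backslash-escaped so that it can sit safely inside a regex character
    # class such as [{punct_regex}] below.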

    # Note: In the original perl implementation, \p{Z} and \p{Zl} were used to
    #       (i) strip leading and trailing spaces and
    #       (ii) de-duplicate spaces.
    #       In Python, this is equivalent to: ' '.join(str.strip().split())
    #       Thus, the next two lines were commented out.
    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
    # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}

    # Pads runs of ASCII characters with spaces, separating them from
    # adjacent non-ASCII text.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = (
        re.compile(f"([^{number_regex}])([{punct_regex}])"),
        "\\1 \\2 ",
    )
    PUNCT_2 = (
        re.compile(f"([{punct_regex}])([^{number_regex}])"),
        " \\1 \\2",
    )
    # Tokenize symbols
    SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
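    # Illustration (not part of the original script): NONASCII separates runs
    # of ASCII characters from adjacent non-ASCII text, PUNCT_1/PUNCT_2 split
    # off punctuation unless it sits between two digits, and SYMBOLS splits
    # off symbol characters; together they turn u'this is a foo\u2604sentence.'
    # into [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
    # (see the class doctest above).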

    def lang_independent_sub(self, text):
        """Performs the language independent string substituitions."""
        # It's a strange order of regexes.
        # It'll be better to unescape after STRIP_EOL_HYPHEN
        # but let's keep it close to the original NIST implementation.
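        # For example, "foo<skipped>bar &amp; baz" would come out as
        # "foobar & baz" (assuming xml_unescape resolves the standard XML
        # entities).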
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
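        """Tokenize ``text`` following the language-dependent (Western
        languages) rules of mteval-v14.pl.

        A brief sketch of the parameters as they are used here:

        :param text: the string to tokenize.
        :param lowercase: lowercase the text (only applied when
            ``western_lang`` is True).
        :param western_lang: apply the language dependent regexes above.
        :param return_str: return the tokenized string instead of a
            list of tokens.
        """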
        text = str(text)
        # Language independent regex.
        text = self.lang_independent_sub(text)
        # Language dependent regex.
        if western_lang:
            # Pad string with whitespace.
            text = " " + text + " "
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespace.
        text = " ".join(text.split())
        # Finally, strip leading and trailing spaces.
        text = text.strip()
        return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
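        """Tokenize ``text`` following the international
        (language-independent) rules of mteval-v14.pl.

        A brief sketch of the parameters as they are used here:

        :param text: the string to tokenize.
        :param lowercase: lowercase the text before applying the regexes.
        :param split_non_ascii: currently unused here; the NONASCII padding
            is always applied as part of INTERNATIONAL_REGEXES.
        :param return_str: return the tokenized string instead of a
            list of tokens.
        """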
        text = str(text)
        # Unlike the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure that there is only one space between words and
        # strip leading and trailing spaces.
        text = " ".join(text.strip().split())
        return text if return_str else text.split()