# Natural Language Toolkit: Python port of the mteval-v14.pl tokenizer.
#
# Copyright (C) 2001-2015 NLTK Project
# Author: Liling Tan (ported from ftp://jaguar.ncsl.nist.gov/mt/resources/mteval-v14.pl)
# Contributors: Ozan Caglayan, Wiktor Stribizew
#
# URL: <https://www.nltk.org>
# For license information, see LICENSE.TXT
"""
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
"""

import io
import re

from nltk.corpus import perluniprops
from nltk.tokenize.api import TokenizerI
from nltk.tokenize.util import xml_unescape


class NISTTokenizer(TokenizerI):
"""
This NIST tokenizer is sentence-based instead of the original
paragraph-based tokenization from mteval-14.pl; The sentence-based
tokenization is consistent with the other tokenizers available in NLTK.
>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()
>>> s = "Good muffins cost $3.88 in New York."
>>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
>>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
>>> nist.tokenize(s, lowercase=False) == expected_cased
True
>>> nist.tokenize(s, lowercase=True) == expected_lower # Lowercased.
True
The international_tokenize() is the preferred function when tokenizing
non-european text, e.g.
>>> from nltk.tokenize.nist import NISTTokenizer
>>> nist = NISTTokenizer()
# Input strings.
>>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) us a Chinese e-commerce company...'
>>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
>>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'
# Expected tokens.
>>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'\u963f\u91cc\u5df4\u5df4\u96c6\u56e2\u63a7\u80a1', u'\u6709\u9650\u516c\u53f8', u')']
>>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'\u02c8\xe6', u'm']
>>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'\u697d\u5929\u682a\u5f0f\u4f1a\u793e', u'Rakuten', u'Kabushiki', u'-', u'gaisha']
>>> nist.international_tokenize(albb)[:10] == expected_albb
True
>>> nist.international_tokenize(amz)[:10] == expected_amz
True
>>> nist.international_tokenize(rkt)[:10] == expected_rkt
True
# Doctest for patching issue #1926
>>> sent = u'this is a foo\u2604sentence.'
>>> expected_sent = [u'this', u'is', u'a', u'foo', u'\u2604', u'sentence', u'.']
>>> nist.international_tokenize(sent) == expected_sent
True
"""
# Strip "skipped" tags
STRIP_SKIP = re.compile("<skipped>"), ""
# Strip end-of-line hyphenation and join lines
STRIP_EOL_HYPHEN = re.compile("\u2028"), " "
# Tokenize punctuation.
PUNCT = re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), " \\1 "
# Tokenize period and comma unless preceded by a digit.
PERIOD_COMMA_PRECEED = re.compile(r"([^0-9])([\.,])"), "\\1 \\2 "
# Tokenize period and comma unless followed by a digit.
PERIOD_COMMA_FOLLOW = re.compile(r"([\.,])([^0-9])"), " \\1 \\2"
# Tokenize dash when preceded by a digit
DASH_PRECEED_DIGIT = re.compile("([0-9])(-)"), "\\1 \\2 "
LANG_DEPENDENT_REGEXES = [
PUNCT,
PERIOD_COMMA_PRECEED,
PERIOD_COMMA_FOLLOW,
DASH_PRECEED_DIGIT,
]
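
    # For example, applied in order to the padded string
    # " Good muffins cost $3.88 in New York. ", these four substitutions
    # separate the "$" and the sentence-final "." but leave "3.88" intact,
    # because its period is both preceded and followed by digits; this is
    # exactly the behaviour exercised by the tokenize() doctest above.
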
# Perluniprops characters used in NIST tokenizer.
pup_number = str("".join(set(perluniprops.chars("Number")))) # i.e. \p{N}
pup_punct = str("".join(set(perluniprops.chars("Punctuation")))) # i.e. \p{P}
pup_symbol = str("".join(set(perluniprops.chars("Symbol")))) # i.e. \p{S}
    # Python regexes need some special characters escaped before they can be
    # used inside a character class; see
    # https://stackoverflow.com/q/45670950/610569
number_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_number)
punct_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_punct)
symbol_regex = re.sub(r"[]^\\-]", r"\\\g<0>", pup_symbol)
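    # For instance, a literal "-" inside one of these character sets would
    # otherwise be read as a range operator within "[...]"; the substitution
    # above turns it into "\-" (and likewise escapes "]", "^" and "\"), so the
    # character classes built below compile safely.
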
    # Note: in the original perl implementation, \p{Z} and \p{Zl} were used to
    # (i) strip leading and trailing spaces and
    # (ii) de-duplicate spaces.
    # In Python, ' '.join(text.strip().split()) achieves the same effect,
    # so the next two lines are commented out.
    # Line_Separator = str(''.join(perluniprops.chars('Line_Separator'))) # i.e. \p{Zl}
    # Separator = str(''.join(perluniprops.chars('Separator'))) # i.e. \p{Z}
    # Pads runs of ASCII characters with spaces, which separates them from
    # adjacent non-ASCII text.
    NONASCII = re.compile("([\x00-\x7f]+)"), r" \1 "
    # Tokenize any punctuation unless followed AND preceded by a digit
    # (the negated character classes play the role of \P{N} in the original
    # perl regexes).
    PUNCT_1 = (
        re.compile(f"([^{number_regex}])([{punct_regex}])"),
        "\\1 \\2 ",
    )
    PUNCT_2 = (
        re.compile(f"([{punct_regex}])([^{number_regex}])"),
        " \\1 \\2",
    )
# Tokenize symbols
SYMBOLS = re.compile(f"([{symbol_regex}])"), " \\1 "
INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]
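
    # For example, u'this is a foo\u2604sentence.' becomes
    # ['this', 'is', 'a', 'foo', u'\u2604', 'sentence', '.']:
    # NONASCII separates the comet sign from the surrounding ASCII runs,
    # PUNCT_1/PUNCT_2 split off the sentence-final period, and SYMBOLS pads
    # the comet sign itself (see the international_tokenize() doctest above).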

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # The order of the substitutions is unusual: it would be cleaner to
        # unescape after STRIP_EOL_HYPHEN, but we keep the order close to the
        # original NIST implementation.
regexp, substitution = self.STRIP_SKIP
text = regexp.sub(substitution, text)
text = xml_unescape(text)
regexp, substitution = self.STRIP_EOL_HYPHEN
text = regexp.sub(substitution, text)
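        # At this point the '<skipped>' tags are gone, XML entities such as
        # '&amp;' have been unescaped, and the U+2028 line separator has been
        # replaced by a plain space.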
return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
text = str(text)
# Language independent regex.
text = self.lang_independent_sub(text)
# Language dependent regex.
if western_lang:
# Pad string with whitespace.
text = " " + text + " "
if lowercase:
text = text.lower()
for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
text = regexp.sub(substitution, text)
        # Collapse contiguous whitespace into single spaces.
        text = " ".join(text.split())
        # Finally, strip leading and trailing spaces.
        text = str(text.strip())
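        # With return_str=True the whitespace-joined string is returned instead
        # of a token list, i.e. ' '.join() of the tokens shown in the class
        # docstring, e.g. 'Good muffins cost $ 3.88 in New York .'.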
return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        text = str(text)
        # Unlike tokenize(), STRIP_EOL_HYPHEN is applied before the XML
        # entities are unescaped.
regexp, substitution = self.STRIP_SKIP
text = regexp.sub(substitution, text)
regexp, substitution = self.STRIP_EOL_HYPHEN
text = regexp.sub(substitution, text)
text = xml_unescape(text)
if lowercase:
text = text.lower()
for regexp, substitution in self.INTERNATIONAL_REGEXES:
text = regexp.sub(substitution, text)
        # Make sure that there is only one space between words and
        # strip leading and trailing spaces.
text = " ".join(text.strip().split())
return text if return_str else text.split()
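

# A minimal sketch for running this module's doctests directly (not part of
# the original module; it assumes the NLTK 'perluniprops' data package is
# installed, e.g. via nltk.download('perluniprops')):
if __name__ == "__main__":
    import doctest

    doctest.testmod()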