#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Regex based tokenizer that emulates the Stanford/NLTK PTB tokenizers.
However it is purely in Python, supports robust untokenization, unicode,
and requires minimal dependencies.
"""
import regex
import logging

from .tokenizer import Tokens, Tokenizer

logger = logging.getLogger(__name__)
class RegexpTokenizer(Tokenizer):
    # Token type patterns. The compiled regex below tries them left to right,
    # so earlier alternatives take precedence over later ones.
    DIGIT = r'\p{Nd}+([:\.\,]\p{Nd}+)*'  # numbers, incl. times/decimals/1,000
    TITLE = (r'(dr|esq|hon|jr|mr|mrs|ms|prof|rev|sr|st|rt|messrs|mmes|msgr)'
             r'\.(?=\p{Z})')  # honorifics keep their trailing period
    ABBRV = r'([\p{L}]\.){2,}(?=\p{Z}|$)'  # dotted abbreviations, e.g. U.S.
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]++'  # runs of letters/digits/combining marks
    HYPHEN = r'{A}([-\u058A\u2010\u2011]{A})+'.format(A=ALPHA_NUM)  # hyphenated words
    NEGATION = r"((?!n't)[\p{L}\p{N}\p{M}])++(?=n't)|n't"  # "don't" -> "do" + "n't"
    CONTRACTION1 = r"can(?=not\b)"  # "cannot" -> "can" + "not"
    CONTRACTION2 = r"'([tsdm]|re|ll|ve)\b"  # clitics: 's, 're, 'll, 've, 'd, 'm, 't
    # Opening/closing quotes (straight, curly, angled); normalized in tokenize().
    START_DQUOTE = r'(?<=[\p{Z}\(\[{<]|^)(``|["\u0093\u201C\u00AB])(?!\p{Z})'
    START_SQUOTE = r'(?<=[\p{Z}\(\[{<]|^)[\'\u0091\u2018\u201B\u2039](?!\p{Z})'
    END_DQUOTE = r'(?<!\p{Z})(\'\'|["\u0094\u201D\u00BB])'
    END_SQUOTE = r'(?<!\p{Z})[\'\u0092\u2019\u203A]'
    DASH = r'--|[\u0096\u0097\u2013\u2014\u2015]'  # '--' and en/em dashes
    ELLIPSES = r'\.\.\.|\u2026'  # '...' or the single-character ellipsis
    PUNCT = r'\p{P}'  # any other punctuation character
    NON_WS = r'[^\p{Z}\p{C}]'  # catch-all: any non-space, non-control character

    def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes).
            substitutions: if true, normalizes some token types (e.g. quotes).
        """
        # One alternation over all token patterns; the named groups record
        # which pattern matched so tokenize() can normalize quotes/dashes.
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
            '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
            '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
            '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        # This class only splits text; it cannot produce POS/lemma/NER annotations.
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)

    def tokenize(self, text):
        data = []
        matches = [m for m in self._regexp.finditer(text)]
        for i in range(len(matches)):
            # Get text
            token = matches[i].group()

            # Make normalizations for special token types
            if self.substitutions:
                groups = matches[i].groupdict()
                if groups['sdquote']:
                    token = "``"
                elif groups['edquote']:
                    token = "''"
                elif groups['ssquote']:
                    token = "`"
                elif groups['esquote']:
                    token = "'"
                elif groups['dash']:
                    token = '--'
                elif groups['ellipses']:
                    token = '...'

            # Get whitespace: the raw text from this token's start up to the
            # start of the next token, kept so the input can be reconstructed
            span = matches[i].span()
            start_ws = span[0]
            if i + 1 < len(matches):
                end_ws = matches[i + 1].span()[0]
            else:
                end_ws = span[1]

            # Format data
            data.append((
                token,
                text[start_ws: end_ws],
                span,
            ))
        return Tokens(data, self.annotators)
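
# The block below is an illustrative usage sketch, not part of the original
# module. It assumes the Tokens class from the companion `tokenizer` module
# exposes `words()` and `untokenize()`, and the sample sentence is made up.
# Because of the relative import above, run it as a module, e.g.
# `python -m <package>.regexp_tokenizer`.
if __name__ == '__main__':
    t = RegexpTokenizer(substitutions=True)
    tokens = t.tokenize('Mr. Smith said: "We can\'t go" -- it was 5.30 p.m.')
    # Quotes/dashes are normalized and contractions split, roughly:
    # ['Mr.', 'Smith', 'said', ':', '``', 'We', 'ca', "n't", 'go', "''", '--', ...]
    print(tokens.words())
    # Per-token trailing whitespace is stored, so the input round-trips
    # (modulo leading/trailing whitespace).
    print(tokens.untokenize())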