#!/usr/bin/env python3
# Copyright 2017-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
"""Regex based tokenizer that emulates the Stanford/NLTK PTB tokenizers.
However it is purely in Python, supports robust untokenization, unicode,
and requires minimal dependencies.
"""
import regex
import logging
from .tokenizer import Tokens, Tokenizer
logger = logging.getLogger(__name__)
class RegexpTokenizer(Tokenizer):
    """Pure-Python regex tokenizer approximating the Stanford/NLTK PTB
    tokenizers, with unicode support, optional PTB-style substitutions
    (e.g. normalizing quotes to `` and ''), and lossless untokenization
    (each token carries the raw text up to the next token).
    """

    # Numbers, possibly with internal separators (e.g. 1,000 / 3.14 / 12:30).
    DIGIT = r'\p{Nd}+([:\.\,]\p{Nd}+)*'
    # Common personal titles followed by a period and a space (e.g. "mr. ").
    TITLE = (r'(dr|esq|hon|jr|mr|mrs|ms|prof|rev|sr|st|rt|messrs|mmes|msgr)'
             r'\.(?=\p{Z})')
    # Dotted abbreviations like "U.S." — two or more letter-period pairs.
    ABBRV = r'([\p{L}]\.){2,}(?=\p{Z}|$)'
    # Runs of letters, digits, and combining marks (possessive quantifier).
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]++'
    # Hyphenated compounds, including unicode hyphen variants.
    HYPHEN = r'{A}([-\u058A\u2010\u2011]{A})+'.format(A=ALPHA_NUM)
    # Split "don't" -> "do" + "n't": stem before n't, or n't itself.
    NEGATION = r"((?!n't)[\p{L}\p{N}\p{M}])++(?=n't)|n't"
    # Split "cannot" -> "can" + "not".
    CONTRACTION1 = r"can(?=not\b)"
    # Clitic contractions: 't, 's, 'd, 'm, 're, 'll, 've.
    CONTRACTION2 = r"'([tsdm]|re|ll|ve)\b"
    # Opening double quote (``, ", unicode variants) after space/bracket/BOL.
    START_DQUOTE = r'(?<=[\p{Z}\(\[{<]|^)(``|["\u0093\u201C\u00AB])(?!\p{Z})'
    # Opening single quote after space/bracket/BOL, not followed by space.
    START_SQUOTE = r'(?<=[\p{Z}\(\[{<]|^)[\'\u0091\u2018\u201B\u2039](?!\p{Z})'
    # Closing double quote ('' / " / unicode variants) not preceded by space.
    END_DQUOTE = r'(?<!\p{Z})(\'\'|["\u0094\u201D\u00BB])'
    # Closing single quote not preceded by space.
    END_SQUOTE = r'(?<!\p{Z})[\'\u0092\u2019\u203A]'
    # Dashes: "--" and unicode en/em/horizontal-bar dash characters.
    DASH = r'--|[\u0096\u0097\u2013\u2014\u2015]'
    # Ellipses: "..." or the single unicode ellipsis character.
    ELLIPSES = r'\.\.\.|\u2026'
    # Any single punctuation character.
    PUNCT = r'\p{P}'
    # Fallback: any single non-whitespace, non-control character.
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (this tokenizer only tokenizes;
                any requested annotators are ignored with a warning).
            substitutions: if True (default), normalize quote, dash, and
                ellipsis tokens to their PTB forms.
        """
        # Alternatives are ordered by priority: specific patterns (digits,
        # titles, abbreviations, negations, ...) must precede the generic
        # ALPHA_NUM / PUNCT / NON_WS fallbacks. Group names here must match
        # the groupdict() keys consumed in tokenize().
        self._regexp = regex.compile(
            '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|'
            '(?P<hyph>%s)|(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|'
            '(?P<sdquote>%s)|(?P<edquote>%s)|(?P<ssquote>%s)|'
            '(?P<esquote>%s)|(?P<dash>%s)|(?P<ellipses>%s)|(?P<punct>%s)|'
            '(?P<nonws>%s)' %
            (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
             self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
             self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
             self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
             self.NON_WS),
            flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
        )
        if len(kwargs.get('annotators', {})) > 0:
            logger.warning('%s only tokenizes! Skipping annotators: %s' %
                           (type(self).__name__, kwargs.get('annotators')))
        self.annotators = set()
        self.substitutions = kwargs.get('substitutions', True)

    def tokenize(self, text):
        """Tokenize `text` and return a Tokens object.

        Each data entry is a (token, text_ws, span) triple where `text_ws`
        is the raw input from this token's start to the next token's start
        (trailing whitespace included), enabling lossless untokenization.
        """
        data = []
        matches = [m for m in self._regexp.finditer(text)]
        for i in range(len(matches)):
            # Raw matched text.
            token = matches[i].group()

            # Normalize quote/dash/ellipsis tokens to their PTB forms.
            if self.substitutions:
                groups = matches[i].groupdict()
                if groups['sdquote']:
                    token = "``"
                elif groups['edquote']:
                    token = "''"
                elif groups['ssquote']:
                    token = "`"
                elif groups['esquote']:
                    token = "'"
                elif groups['dash']:
                    token = '--'
                elif groups['ellipses']:
                    token = '...'

            # Whitespace span: from this token's start up to the start of
            # the next token (or this token's end, for the last token).
            span = matches[i].span()
            start_ws = span[0]
            if i + 1 < len(matches):
                end_ws = matches[i + 1].span()[0]
            else:
                end_ws = span[1]

            # Format data
            data.append((
                token,
                text[start_ws: end_ws],
                span,
            ))
        return Tokens(data, self.annotators)