Spaces:
Build error
Build error
#!/usr/bin/env python3 | |
# Copyright 2017-present, Facebook, Inc. | |
# All rights reserved. | |
# | |
# This source code is licensed under the license found in the | |
# LICENSE file in the root directory of this source tree. | |
"""Basic tokenizer that splits text into alpha-numeric tokens and | |
non-whitespace tokens. | |
""" | |
import regex | |
import logging | |
from .tokenizer import Tokens, Tokenizer | |
logger = logging.getLogger(__name__) | |
class SimpleTokenizer(Tokenizer):
    """Regex tokenizer producing alpha-numeric runs and single symbols.

    Text is scanned with one compiled pattern that matches either a run
    of letters/numbers/marks (``ALPHA_NUM``) or, failing that, a single
    character that is neither a Unicode separator nor a control/other
    character (``NON_WS``). Characters matched by neither alternative
    (separators and control characters) never become tokens; they only
    appear as the trailing part of the preceding token's raw text.
    """

    # One or more Unicode letters (L), numbers (N) or marks (M).
    ALPHA_NUM = r'[\p{L}\p{N}\p{M}]+'
    # Exactly one character outside the separator (Z) and other/control
    # (C) Unicode categories.
    NON_WS = r'[^\p{Z}\p{C}]'

    def __init__(self, **kwargs):
        """
        Args:
            annotators: None or empty set (only tokenizes). Any non-empty
                value is ignored and a warning is logged.
        """
        self._regexp = regex.compile(
            '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
            flags=regex.IGNORECASE | regex.UNICODE | regex.MULTILINE,
        )
        # Fetch without a default: an explicit ``annotators=None`` is a
        # documented input and must not reach ``len()``.
        annotators = kwargs.get('annotators')
        if annotators:
            # Lazy %-args: the message is only rendered if it is emitted.
            logger.warning('%s only tokenizes! Skipping annotators: %s',
                           type(self).__name__, annotators)
        self.annotators = set()

    def tokenize(self, text):
        """Split ``text`` into tokens.

        Args:
            text: the string to tokenize.

        Returns:
            A ``Tokens`` object built from ``self.annotators`` and a list
            with one ``(token, text_ws, span)`` tuple per regex match:
            ``token`` is the matched text, ``text_ws`` is the slice of
            ``text`` from the start of this match up to the start of the
            next match (so it includes any skipped characters that
            follow the token), and ``span`` is the match's
            ``(start, end)`` offsets. For the last match ``text_ws``
            stops at the match end, so trailing unmatched characters are
            dropped.
        """
        matches = list(self._regexp.finditer(text))

        # Right edge of each token's raw text: the start of the following
        # match, or the match's own end for the final token.
        ws_ends = [m.start() for m in matches[1:]]
        if matches:
            ws_ends.append(matches[-1].end())

        data = [
            (m.group(), text[m.start():ws_end], m.span())
            for m, ws_end in zip(matches, ws_ends)
        ]
        return Tokens(data, self.annotators)