import re
from typing import List, Union
from overrides import overrides
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
class WhitespaceTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that splits the text on spaces.
    """

    def __init__(self):
        super().__init__()
        # Each run of non-whitespace characters is one token; the match
        # offsets give the token's character span in the input text.
        self.whitespace_regex = re.compile(r"\S+")

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> List[List[Word]]:
        """
        Tokenize the input into single words by splitting on spaces.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings or
                pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized in single words.

        Example::

            >>> from relik.inference.data.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
            >>> whitespace_tokenizer = WhitespaceTokenizer()
            >>> whitespace_tokenizer("Mary sold the car to John .")
        """
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)
        return tokenized

    @overrides
    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        """
        Tokenize a single sample by splitting on whitespace.

        Args:
            text (:obj:`str` or :obj:`List[str]`):
                Raw text, or a pre-tokenized list of strings (re-joined with
                single spaces so character offsets are well defined).

        Returns:
            :obj:`List[Word]`: One :obj:`Word` per token, carrying the token
            text, its index, and its start/end character offsets.

        Raises:
            ValueError: If ``text`` is neither a string nor a list.
        """
        if not isinstance(text, (str, list)):
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )
        if isinstance(text, list):
            text = " ".join(text)
        # Read the match objects directly instead of packing (text, start, end)
        # tuples first — same tokens and offsets, one less indirection.
        return [
            Word(match.group(0), idx, start_char=match.start(), end_char=match.end())
            for idx, match in enumerate(self.whitespace_regex.finditer(text))
        ]