import logging
from typing import Dict, List, Tuple, Union
import spacy
# from ipa.common.utils import load_spacy
from overrides import overrides
from spacy.cli.download import download as spacy_download
from spacy.tokens import Doc
from relik.common.log import get_logger
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer
logger = get_logger(level=logging.DEBUG)
# Cache of loaded spaCy models, keyed by (language, pos_tags, lemma, parse, split_on_spaces)
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}
def load_spacy(
language: str,
pos_tags: bool = False,
lemma: bool = False,
parse: bool = False,
split_on_spaces: bool = False,
) -> spacy.Language:
"""
    Download and load a spaCy model.
    Args:
        language (:obj:`str`):
Language of the text to tokenize.
pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, performs POS tagging with spacy model.
lemma (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, performs lemmatization with spacy model.
parse (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, performs dependency parsing with spacy model.
split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, will split by spaces without performing tokenization.
Returns:
:obj:`spacy.Language`: The spacy model loaded.
"""
exclude = ["vectors", "textcat", "ner"]
if not pos_tags:
exclude.append("tagger")
if not lemma:
exclude.append("lemmatizer")
if not parse:
exclude.append("parser")
# check if the model is already loaded
# if so, there is no need to reload it
spacy_params = (language, pos_tags, lemma, parse, split_on_spaces)
if spacy_params not in LOADED_SPACY_MODELS:
try:
spacy_tagger = spacy.load(language, exclude=exclude)
except OSError:
logger.warning(
"Spacy model '%s' not found. Downloading and installing.", language
)
spacy_download(language)
spacy_tagger = spacy.load(language, exclude=exclude)
# if everything is disabled, return only the tokenizer
# for faster tokenization
# TODO: is it really faster?
# if len(exclude) >= 6:
# spacy_tagger = spacy_tagger.tokenizer
LOADED_SPACY_MODELS[spacy_params] = spacy_tagger
return LOADED_SPACY_MODELS[spacy_params]
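# A minimal usage sketch of the cache above (the "en_core_web_sm" pipeline
# name is only an example; any installed spaCy model name works). Calling
# load_spacy twice with the same (language, pos_tags, lemma, parse,
# split_on_spaces) key returns the very same Language instance instead of
# reloading the model:
#
#   nlp = load_spacy("en_core_web_sm", pos_tags=True, lemma=True)
#   assert load_spacy("en_core_web_sm", pos_tags=True, lemma=True) is nlp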
class SpacyTokenizer(BaseTokenizer):
"""
    A :obj:`Tokenizer` that uses spaCy to tokenize and preprocess the text. It returns :obj:`Word` objects.
Args:
language (:obj:`str`, optional, defaults to :obj:`en`):
Language of the text to tokenize.
return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, performs POS tagging with spacy model.
return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, performs lemmatization with spacy model.
return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, performs dependency parsing with spacy model.
split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True`, will split by spaces without performing tokenization.
use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will load the spaCy model on GPU.
"""
def __init__(
self,
language: str = "en",
return_pos_tags: bool = False,
return_lemmas: bool = False,
return_deps: bool = False,
split_on_spaces: bool = False,
use_gpu: bool = False,
):
        super().__init__()
if language not in SPACY_LANGUAGE_MAPPER:
raise ValueError(
f"`{language}` language not supported. The supported "
f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
)
if use_gpu:
# load the model on GPU
# if the GPU is not available or not correctly configured,
            # it will raise an error
spacy.require_gpu()
self.spacy = load_spacy(
SPACY_LANGUAGE_MAPPER[language],
return_pos_tags,
return_lemmas,
return_deps,
split_on_spaces,
)
self.split_on_spaces = split_on_spaces
def __call__(
self,
texts: Union[str, List[str], List[List[str]]],
is_split_into_words: bool = False,
**kwargs,
) -> Union[List[Word], List[List[Word]]]:
"""
Tokenize the input into single words using SpaCy models.
Args:
texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings, or a batch of pre-tokenized strings.
is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
If :obj:`True` and the input is a string, the input is split on spaces.
Returns:
:obj:`List[List[Word]]`: The input text tokenized in single words.
Example::
            >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
            >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
            >>> spacy_tokenizer("Mary sold the car to John.")
"""
# check if input is batched or a single sample
is_batched = self.check_is_batched(texts, is_split_into_words)
if is_batched:
tokenized = self.tokenize_batch(texts)
else:
tokenized = self.tokenize(texts)
return tokenized
@overrides
def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
if self.split_on_spaces:
if isinstance(text, str):
text = text.split(" ")
spaces = [True] * len(text)
text = Doc(self.spacy.vocab, words=text, spaces=spaces)
return self._clean_tokens(self.spacy(text))
@overrides
def tokenize_batch(
self, texts: Union[List[str], List[List[str]]]
) -> List[List[Word]]:
if self.split_on_spaces:
if isinstance(texts[0], str):
texts = [text.split(" ") for text in texts]
spaces = [[True] * len(text) for text in texts]
texts = [
Doc(self.spacy.vocab, words=text, spaces=space)
for text, space in zip(texts, spaces)
]
return [self._clean_tokens(tokens) for tokens in self.spacy.pipe(texts)]
@staticmethod
def _clean_tokens(tokens: Doc) -> List[Word]:
"""
Converts spaCy tokens to :obj:`Word`.
Args:
tokens (:obj:`spacy.tokens.Doc`):
Tokens from SpaCy model.
Returns:
:obj:`List[Word]`: The SpaCy model output converted into :obj:`Word` objects.
"""
words = [
Word(
token.text,
token.i,
token.idx,
token.idx + len(token),
token.lemma_,
token.pos_,
token.dep_,
token.head.i,
)
for token in tokens
]
return words
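# For reference, _clean_tokens maps every spaCy token to a Word holding, in
# order: surface form, token index, start and end character offsets, lemma,
# POS tag, dependency label and the index of the syntactic head. A minimal
# sketch of the full round trip (the Word field names themselves are defined
# in relik.inference.data.objects and are not shown here):
#
#   tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
#   words = tokenizer("Mary sold the car to John.")  # -> List[Word]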
class WhitespaceSpacyTokenizer:
"""Simple white space tokenizer for SpaCy."""
def __init__(self, vocab):
self.vocab = vocab
def __call__(self, text):
if isinstance(text, str):
words = text.split(" ")
elif isinstance(text, list):
words = text
else:
raise ValueError(
f"text must be either `str` or `list`, found: `{type(text)}`"
)
spaces = [True] * len(words)
return Doc(self.vocab, words=words, spaces=spaces)
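# WhitespaceSpacyTokenizer can be plugged into a spaCy pipeline as a drop-in
# replacement for its default tokenizer, so that downstream components run on
# whitespace-split tokens. A minimal sketch (the "en_core_web_sm" pipeline
# name is only an example):
#
#   nlp = spacy.load("en_core_web_sm")
#   nlp.tokenizer = WhitespaceSpacyTokenizer(nlp.vocab)
#   doc = nlp("Mary sold the car to John .")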