import logging
from typing import Dict, List, Tuple, Union

import spacy

# from ipa.common.utils import load_spacy
from overrides import overrides
from spacy.cli.download import download as spacy_download
from spacy.tokens import Doc

from relik.common.log import get_logger
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer

logger = get_logger(level=logging.DEBUG)

# Spacy and Stanza stuff

LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}


def load_spacy(
    language: str,
    pos_tags: bool = False,
    lemma: bool = False,
    parse: bool = False,
    split_on_spaces: bool = False,
) -> spacy.Language:
    """
    Download and load spacy model.

    Args:
        language (:obj:`str`):
            Language of the text to tokenize.
        pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with spacy model.
        lemma (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with spacy model.
        parse (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with spacy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.

    Returns:
        :obj:`spacy.Language`: The spacy model loaded.
    """
    exclude = ["vectors", "textcat", "ner"]
    if not pos_tags:
        exclude.append("tagger")
    if not lemma:
        exclude.append("lemmatizer")
    if not parse:
        exclude.append("parser")

    # check if the model is already loaded
    # if so, there is no need to reload it
    spacy_params = (language, pos_tags, lemma, parse, split_on_spaces)
    if spacy_params not in LOADED_SPACY_MODELS:
        try:
            spacy_tagger = spacy.load(language, exclude=exclude)
        except OSError:
            logger.warning(
                "Spacy model '%s' not found. Downloading and installing.", language
            )
            spacy_download(language)
            spacy_tagger = spacy.load(language, exclude=exclude)

        # if everything is disabled, return only the tokenizer
        # for faster tokenization
        # TODO: is it really faster?
        # if len(exclude) >= 6:
        #     spacy_tagger = spacy_tagger.tokenizer
        LOADED_SPACY_MODELS[spacy_params] = spacy_tagger

    return LOADED_SPACY_MODELS[spacy_params]


class SpacyTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that uses SpaCy to tokenize and preprocess the text.
    It returns :obj:`Word` objects.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to tokenize.
        return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with spacy model.
        return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with spacy model.
        return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with spacy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.
        use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will load the SpaCy model on GPU.
    """

    def __init__(
        self,
        language: str = "en",
        return_pos_tags: bool = False,
        return_lemmas: bool = False,
        return_deps: bool = False,
        split_on_spaces: bool = False,
        use_gpu: bool = False,
    ):
        super(SpacyTokenizer, self).__init__()
        if language not in SPACY_LANGUAGE_MAPPER:
            raise ValueError(
                f"`{language}` language not supported. The supported "
                f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
            )
        if use_gpu:
            # load the model on GPU
            # if the GPU is not available or not correctly configured,
            # it will raise an error
            spacy.require_gpu()
        self.spacy = load_spacy(
            SPACY_LANGUAGE_MAPPER[language],
            return_pos_tags,
            return_lemmas,
            return_deps,
            split_on_spaces,
        )
        self.split_on_spaces = split_on_spaces

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[Word], List[List[Word]]]:
        """
        Tokenize the input into single words using SpaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings,
                or pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized into single words.

        Example::

            >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
            >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
            >>> spacy_tokenizer("Mary sold the car to John.")
        """
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)
        return tokenized

    @overrides
    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        if self.split_on_spaces:
            if isinstance(text, str):
                text = text.split(" ")
            spaces = [True] * len(text)
            text = Doc(self.spacy.vocab, words=text, spaces=spaces)
        return self._clean_tokens(self.spacy(text))

    @overrides
    def tokenize_batch(
        self, texts: Union[List[str], List[List[str]]]
    ) -> List[List[Word]]:
        if self.split_on_spaces:
            if isinstance(texts[0], str):
                texts = [text.split(" ") for text in texts]
            spaces = [[True] * len(text) for text in texts]
            texts = [
                Doc(self.spacy.vocab, words=text, spaces=space)
                for text, space in zip(texts, spaces)
            ]
        return [self._clean_tokens(tokens) for tokens in self.spacy.pipe(texts)]

    @staticmethod
    def _clean_tokens(tokens: Doc) -> List[Word]:
        """
        Converts SpaCy tokens to :obj:`Word`.

        Args:
            tokens (:obj:`spacy.tokens.Doc`):
                Tokens from the SpaCy model.

        Returns:
            :obj:`List[Word]`: The SpaCy model output converted into :obj:`Word` objects.
        """
        words = [
            Word(
                token.text,
                token.i,
                token.idx,
                token.idx + len(token),
                token.lemma_,
                token.pos_,
                token.dep_,
                token.head.i,
            )
            for token in tokens
        ]
        return words


class WhitespaceSpacyTokenizer:
    """Simple white space tokenizer for SpaCy."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        if isinstance(text, str):
            words = text.split(" ")
        elif isinstance(text, list):
            words = text
        else:
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
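

# Minimal usage sketch, not part of the library itself: it assumes "en" is a key
# of SPACY_LANGUAGE_MAPPER and that the mapped SpaCy model can be downloaded on
# first use. How each `Word` prints depends on its definition in
# relik.inference.data.objects.
if __name__ == "__main__":
    # single sentence, with POS tags and lemmas enabled
    tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
    words = tokenizer("Mary sold the car to John.")
    print(words)

    # batch of sentences: a list of lists of `Word` objects is returned
    batch = tokenizer(["Mary sold the car to John.", "John bought the car."])
    for sentence_words in batch:
        print(sentence_words)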