import logging
from typing import Dict, List, Tuple, Union

import spacy

# from ipa.common.utils import load_spacy
from overrides import overrides
from spacy.cli.download import download as spacy_download
from spacy.tokens import Doc

from relik.common.log import get_logger
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer

logger = get_logger(level=logging.DEBUG)

# cache of already-loaded spaCy models, keyed by
# (language, pos_tags, lemma, parse, split_on_spaces)
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}


def load_spacy(
    language: str,
    pos_tags: bool = False,
    lemma: bool = False,
    parse: bool = False,
    split_on_spaces: bool = False,
) -> spacy.Language:
""" | |
Download and load spacy model. | |
Args: | |
language (:obj:`str`, defaults to :obj:`en`): | |
Language of the text to tokenize. | |
pos_tags (:obj:`bool`, optional, defaults to :obj:`False`): | |
If :obj:`True`, performs POS tagging with spacy model. | |
lemma (:obj:`bool`, optional, defaults to :obj:`False`): | |
If :obj:`True`, performs lemmatization with spacy model. | |
parse (:obj:`bool`, optional, defaults to :obj:`False`): | |
If :obj:`True`, performs dependency parsing with spacy model. | |
split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`): | |
If :obj:`True`, will split by spaces without performing tokenization. | |
Returns: | |
:obj:`spacy.Language`: The spacy model loaded. | |
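
    Example (a sketch; ``en_core_web_sm`` is an assumed model name, not one
    referenced by this module)::

        >>> nlp = load_spacy("en_core_web_sm", pos_tags=True)
        >>> isinstance(nlp, spacy.Language)
        True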
""" | |
exclude = ["vectors", "textcat", "ner"] | |
if not pos_tags: | |
exclude.append("tagger") | |
if not lemma: | |
exclude.append("lemmatizer") | |
if not parse: | |
exclude.append("parser") | |
# check if the model is already loaded | |
# if so, there is no need to reload it | |
spacy_params = (language, pos_tags, lemma, parse, split_on_spaces) | |
if spacy_params not in LOADED_SPACY_MODELS: | |
try: | |
spacy_tagger = spacy.load(language, exclude=exclude) | |
except OSError: | |
logger.warning( | |
"Spacy model '%s' not found. Downloading and installing.", language | |
) | |
spacy_download(language) | |
spacy_tagger = spacy.load(language, exclude=exclude) | |
# if everything is disabled, return only the tokenizer | |
# for faster tokenization | |
# TODO: is it really faster? | |
# if len(exclude) >= 6: | |
# spacy_tagger = spacy_tagger.tokenizer | |
LOADED_SPACY_MODELS[spacy_params] = spacy_tagger | |
return LOADED_SPACY_MODELS[spacy_params] | |


class SpacyTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that uses spaCy to tokenize and preprocess the text. It returns :obj:`Word` objects.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to tokenize.
        return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with the spaCy model.
        return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with the spaCy model.
        return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with the spaCy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will split by spaces without performing tokenization.
        use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, will load the spaCy model on GPU.
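
    Example (a sketch; assumes the ``en`` key maps to an installed spaCy model)::

        >>> tokenizer = SpacyTokenizer(language="en", return_pos_tags=True)
        >>> words = tokenizer("Mary sold the car to John.")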
""" | |

    def __init__(
        self,
        language: str = "en",
        return_pos_tags: bool = False,
        return_lemmas: bool = False,
        return_deps: bool = False,
        split_on_spaces: bool = False,
        use_gpu: bool = False,
    ):
        super().__init__()
        if language not in SPACY_LANGUAGE_MAPPER:
            raise ValueError(
                f"`{language}` language not supported. The supported "
                f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
            )
        if use_gpu:
            # load the model on GPU
            # if the GPU is not available or not correctly configured,
            # it will raise an error
            spacy.require_gpu()
        self.spacy = load_spacy(
            SPACY_LANGUAGE_MAPPER[language],
            return_pos_tags,
            return_lemmas,
            return_deps,
            split_on_spaces,
        )
        self.split_on_spaces = split_on_spaces

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[Word], List[List[Word]]]:
        """
        Tokenize the input into single words using spaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings,
                or a batch of pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[Word]` or :obj:`List[List[Word]]`: The input text tokenized into single words.

        Example::

            >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer
            >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
            >>> spacy_tokenizer("Mary sold the car to John.")
        """
        # check if input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            tokenized = self.tokenize_batch(texts)
        else:
            tokenized = self.tokenize(texts)
        return tokenized

    def tokenize(self, text: Union[str, List[str]]) -> List[Word]:
        if self.split_on_spaces:
            # build the Doc directly from whitespace-split words,
            # bypassing spaCy's own tokenizer
            if isinstance(text, str):
                text = text.split(" ")
            spaces = [True] * len(text)
            text = Doc(self.spacy.vocab, words=text, spaces=spaces)
        return self._clean_tokens(self.spacy(text))

    def tokenize_batch(
        self, texts: Union[List[str], List[List[str]]]
    ) -> List[List[Word]]:
        if self.split_on_spaces:
            # build the Docs directly from whitespace-split words,
            # bypassing spaCy's own tokenizer
            if isinstance(texts[0], str):
                texts = [text.split(" ") for text in texts]
            spaces = [[True] * len(text) for text in texts]
            texts = [
                Doc(self.spacy.vocab, words=text, spaces=space)
                for text, space in zip(texts, spaces)
            ]
        return [self._clean_tokens(tokens) for tokens in self.spacy.pipe(texts)]

    @staticmethod
    def _clean_tokens(tokens: Doc) -> List[Word]:
        """
        Converts spaCy tokens to :obj:`Word`.

        Args:
            tokens (:obj:`spacy.tokens.Doc`):
                Tokens from the spaCy model.

        Returns:
            :obj:`List[Word]`: The spaCy model output converted into :obj:`Word` objects.
        """
        words = [
            Word(
                token.text,
                token.i,
                token.idx,
                token.idx + len(token),
                token.lemma_,
                token.pos_,
                token.dep_,
                token.head.i,
            )
            for token in tokens
        ]
        return words


class WhitespaceSpacyTokenizer:
    """Simple whitespace tokenizer for spaCy."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        if isinstance(text, str):
            words = text.split(" ")
        elif isinstance(text, list):
            words = text
        else:
            raise ValueError(
                f"text must be either `str` or `list`, found: `{type(text)}`"
            )
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)
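

# A minimal usage sketch (assumes the model mapped by the `en` key in
# SPACY_LANGUAGE_MAPPER is installed; load_spacy will try to download it
# otherwise):
if __name__ == "__main__":
    spacy_tokenizer = SpacyTokenizer(
        language="en", return_pos_tags=True, return_lemmas=True
    )
    for word in spacy_tokenizer("Mary sold the car to John."):
        print(word)

    # WhitespaceSpacyTokenizer can be plugged in as a pipeline's tokenizer so
    # that pre-split text is consumed as-is ("en_core_web_sm" is an assumed
    # model name, not one referenced by this module):
    nlp = load_spacy("en_core_web_sm")
    nlp.tokenizer = WhitespaceSpacyTokenizer(nlp.vocab)
    print([token.text for token in nlp("Mary sold the car to John .")])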