import logging
from copy import deepcopy
from typing import Dict, List, Tuple, Union, Any

import spacy
from spacy.cli.download import download as spacy_download
from spacy.tokens import Doc

from relik.common.log import get_logger
from relik.inference.data.objects import Word
from relik.inference.data.tokenizers import SPACY_LANGUAGE_MAPPER
from relik.inference.data.tokenizers.base_tokenizer import BaseTokenizer

logger = get_logger(level=logging.DEBUG)

LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool, bool], spacy.Language] = {}


def load_spacy(
    language: str,
    pos_tags: bool = False,
    lemma: bool = False,
    parse: bool = False,
    split_on_spaces: bool = False,
) -> spacy.Language:
    """
    Download and load a spacy model.

    Args:
        language (:obj:`str`):
            Language of the text to tokenize.
        pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with the spacy model.
        lemma (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with the spacy model.
        parse (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with the spacy model.
        split_on_spaces (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, splits on spaces without performing full tokenization.

    Returns:
        :obj:`spacy.Language`: The loaded spacy model.
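
    Example::

        >>> # minimal usage sketch: load the small English pipeline
        >>> # (downloaded automatically if missing), keeping tagger and lemmatizer
        >>> nlp = load_spacy("en_core_web_sm", pos_tags=True, lemma=True)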
""" |
|
    exclude = ["vectors", "textcat", "ner"]
    if not pos_tags:
        exclude.append("tagger")
    if not lemma:
        exclude.append("lemmatizer")
    if not parse:
        exclude.append("parser")

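    # reuse an already-loaded pipeline for this exact configuration, if available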
    spacy_params = (language, pos_tags, lemma, parse, split_on_spaces)
    if spacy_params not in LOADED_SPACY_MODELS:
        try:
            spacy_tagger = spacy.load(language, exclude=exclude)
        except OSError:
            logger.warning(
                "Spacy model '%s' not found. Downloading and installing.", language
            )
            spacy_download(language)
            spacy_tagger = spacy.load(language, exclude=exclude)

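        # when every component is excluded and we only split on spaces,
        # keep just the tokenizer, which is cheaper than a full pipeline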
        if len(exclude) >= 6 and split_on_spaces:
            spacy_tagger = spacy_tagger.tokenizer
        LOADED_SPACY_MODELS[spacy_params] = spacy_tagger

    return LOADED_SPACY_MODELS[spacy_params]


class SpacyTokenizer(BaseTokenizer):
    """
    A :obj:`Tokenizer` that uses SpaCy to tokenize and preprocess the text. It returns :obj:`Word` objects.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to tokenize.
        return_pos_tags (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs POS tagging with the spacy model.
        return_lemmas (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs lemmatization with the spacy model.
        return_deps (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, performs dependency parsing with the spacy model.
        use_gpu (:obj:`bool`, optional, defaults to :obj:`False`):
            If :obj:`True`, loads the SpaCy model on GPU.
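
    Example::

        >>> # brief usage sketch; ``"en"`` must be among the supported languages
        >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)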
""" |
|
|
|
    def __init__(
        self,
        language: str = "en",
        return_pos_tags: bool = False,
        return_lemmas: bool = False,
        return_deps: bool = False,
        use_gpu: bool = False,
    ):
        super().__init__()
        if language not in SPACY_LANGUAGE_MAPPER:
            raise ValueError(
                f"`{language}` language not supported. The supported "
                f"languages are: {list(SPACY_LANGUAGE_MAPPER.keys())}."
            )
        if use_gpu:
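            # run spaCy on GPU; require_gpu() raises an error if no GPU
            # is available or it is not correctly configured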
            spacy.require_gpu()
        self.spacy = load_spacy(
            SPACY_LANGUAGE_MAPPER[language],
            return_pos_tags,
            return_lemmas,
            return_deps,
        )

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[Word], List[List[Word]]]:
        """
        Tokenize the input into single words using SpaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to tag. It can be a single string, a batch of strings, or a batch of pre-tokenized strings.
            is_split_into_words (:obj:`bool`, optional, defaults to :obj:`False`):
                If :obj:`True` and the input is a string, the input is split on spaces.

        Returns:
            :obj:`List[List[Word]]`: The input text tokenized into single words.

        Example::

            >>> from relik.inference.data.tokenizers.spacy_tokenizer import SpacyTokenizer

            >>> spacy_tokenizer = SpacyTokenizer(language="en", return_pos_tags=True, return_lemmas=True)
            >>> spacy_tokenizer("Mary sold the car to John.")

        """
        is_batched = self.check_is_batched(texts, is_split_into_words)

        if is_batched:
            tokenized = self.tokenize_batch(texts, is_split_into_words)
        else:
            tokenized = self.tokenize(texts, is_split_into_words)

        return tokenized

    def tokenize(self, text: Union[str, List[str]], is_split_into_words: bool) -> Doc:
        if is_split_into_words:
            if isinstance(text, str):
                text = text.split(" ")
            elif not isinstance(text, list):
                raise ValueError(
                    f"text must be either `str` or `list`, found: `{type(text)}`"
                )
            spaces = [True] * len(text)
            return self.spacy(Doc(self.spacy.vocab, words=text, spaces=spaces))
        return self.spacy(text)

    def tokenize_batch(
        self, texts: Union[List[str], List[List[str]]], is_split_into_words: bool
    ) -> Union[List[Any], List[Doc]]:
        try:
            if is_split_into_words:
                if isinstance(texts[0], str):
                    texts = [text.split(" ") for text in texts]
                elif not isinstance(texts[0], list):
                    raise ValueError(
                        f"text must be either `str` or `list`, found: `{type(texts[0])}`"
                    )
                spaces = [[True] * len(text) for text in texts]
                texts = [
                    Doc(self.spacy.vocab, words=text, spaces=space)
                    for text, space in zip(texts, spaces)
                ]
            return list(self.spacy.pipe(texts))
        except AttributeError:
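            # fall back to tokenizing one text at a time when the batched
            # spaCy call is not supported by the loaded object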
            return [self.spacy(tokens) for tokens in texts]