from typing import Any, Iterable, List, Optional, Union

import spacy

from relik.inference.data.objects import Word
from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
from relik.inference.data.tokenizers.spacy_tokenizer import load_spacy

SPACY_LANGUAGE_MAPPER = {
    "cs": "xx_sent_ud_sm",
    "da": "xx_sent_ud_sm",
    "de": "xx_sent_ud_sm",
    "fa": "xx_sent_ud_sm",
    "fi": "xx_sent_ud_sm",
    "fr": "xx_sent_ud_sm",
    "el": "el_core_news_sm",
    "en": "xx_sent_ud_sm",
    "es": "xx_sent_ud_sm",
    "ga": "xx_sent_ud_sm",
    "hr": "xx_sent_ud_sm",
    "id": "xx_sent_ud_sm",
    "it": "xx_sent_ud_sm",
    "ja": "ja_core_news_sm",
    "lv": "xx_sent_ud_sm",
    "lt": "xx_sent_ud_sm",
    "mr": "xx_sent_ud_sm",
    "nb": "xx_sent_ud_sm",
    "nl": "xx_sent_ud_sm",
    "no": "xx_sent_ud_sm",
    "pl": "pl_core_news_sm",
    "pt": "xx_sent_ud_sm",
    "ro": "xx_sent_ud_sm",
    "ru": "xx_sent_ud_sm",
    "sk": "xx_sent_ud_sm",
    "sr": "xx_sent_ud_sm",
    "sv": "xx_sent_ud_sm",
    "te": "xx_sent_ud_sm",
    "vi": "xx_sent_ud_sm",
    "zh": "zh_core_web_sm",
}


class SpacySentenceSplitter(BaseSentenceSplitter):
    """
    A :obj:`SentenceSplitter` that uses spaCy's built-in sentence boundary detection.

    Args:
        language (:obj:`str`, optional, defaults to :obj:`en`):
            Language of the text to split.
        model_type (:obj:`str`, optional, defaults to :obj:`statistical`):
            Three types of sentence splitter are supported:

            - ``dependency``: uses a dependency parse to detect sentence boundaries.
              Slow, but accurate.
            - ``statistical``: uses spaCy's trained ``senter`` component. Faster than
              the dependency parser, with competitive accuracy.
            - ``rule_based``: uses punctuation to detect sentence boundaries. Fast,
              with a small memory footprint.
    """

    def __init__(self, language: str = "en", model_type: str = "statistical") -> None:
        # we only need spaCy's dependency parser when using dependency-based
        # sentence boundary detection
        dep = model_type == "dependency"
        if language in SPACY_LANGUAGE_MAPPER:
            self.spacy = load_spacy(SPACY_LANGUAGE_MAPPER[language], parse=dep)
        else:
            self.spacy = spacy.blank(language)
            # force type to rule_based since there is no pre-trained model
            model_type = "rule_based"
        if model_type == "dependency":
            # the dependency parser is already requested at model load time
            pass
        elif model_type == "statistical":
            # `senter` may ship disabled in the pipeline, so enable it if needed
            if not self.spacy.has_pipe("senter"):
                self.spacy.enable_pipe("senter")
        elif model_type == "rule_based":
            # `sentencizer` is spaCy's built-in module for rule-based sentence
            # boundary detection
            if not self.spacy.has_pipe("sentencizer"):
                self.spacy.add_pipe("sentencizer")
        else:
            raise ValueError(
                f"type {model_type} not supported. Choose between `dependency`, "
                "`statistical` or `rule_based`"
            )

    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        max_length: Optional[int] = None,
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[str], List[List[str]]]:
        """
        Split the input text(s) into sentences using spaCy models.

        Args:
            texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
                Text to split. It can be a single string, a batch of strings or
                pre-tokenized strings.
            max_length (:obj:`int`, optional, defaults to :obj:`None`):
                Maximum length of a single sentence. If a sentence is longer than
                `max_length`, it is split into multiple chunks.

        Returns:
            :obj:`List[List[str]]`: The input doc split into sentences.
        """
        # check if the input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            sents = self.split_sentences_batch(texts)
        else:
            sents = self.split_sentences(texts, max_length)
        return sents

    @staticmethod
    def chunked(iterable, n: int) -> Iterable[List[Any]]:
        """
        Chunks a list into n-sized chunks.

        Args:
            iterable (:obj:`List[Any]`):
                List to chunk.
            n (:obj:`int`):
                Size of the chunks.

        Returns:
            :obj:`Iterable[List[Any]]`: The input list chunked into n-sized chunks.
        """
        return [iterable[i : i + n] for i in range(0, len(iterable), n)]

    def split_sentences(
        self,
        text: Union[str, List[Word]],
        max_length: Optional[int] = None,
        *args,
        **kwargs,
    ) -> List[str]:
        """
        Splits a `text` into smaller sentences.

        Args:
            text (:obj:`str`):
                Text to split.
            max_length (:obj:`int`, optional, defaults to :obj:`None`):
                Maximum length of a single sentence. If a sentence is longer than
                `max_length`, it is split into multiple chunks.

        Returns:
            :obj:`List[str]`: The input text split into sentences.
        """
        sentences = [sent for sent in self.spacy(text).sents]
        if max_length is not None and max_length > 0:
            # split sentences longer than `max_length` into fixed-size chunks
            sentences = [
                chunk
                for sentence in sentences
                for chunk in self.chunked(sentence, max_length)
            ]
        return sentences