from typing import Any, Iterable, List, Optional, Union

import spacy

from relik.inference.data.objects import Word
from relik.inference.data.splitters.base_sentence_splitter import BaseSentenceSplitter
from relik.inference.data.tokenizers.spacy_tokenizer import load_spacy

SPACY_LANGUAGE_MAPPER = {
"cs": "xx_sent_ud_sm",
"da": "xx_sent_ud_sm",
"de": "xx_sent_ud_sm",
"fa": "xx_sent_ud_sm",
"fi": "xx_sent_ud_sm",
"fr": "xx_sent_ud_sm",
"el": "el_core_news_sm",
"en": "xx_sent_ud_sm",
"es": "xx_sent_ud_sm",
"ga": "xx_sent_ud_sm",
"hr": "xx_sent_ud_sm",
"id": "xx_sent_ud_sm",
"it": "xx_sent_ud_sm",
"ja": "ja_core_news_sm",
"lv": "xx_sent_ud_sm",
"lt": "xx_sent_ud_sm",
"mr": "xx_sent_ud_sm",
"nb": "xx_sent_ud_sm",
"nl": "xx_sent_ud_sm",
"no": "xx_sent_ud_sm",
"pl": "pl_core_news_sm",
"pt": "xx_sent_ud_sm",
"ro": "xx_sent_ud_sm",
"ru": "xx_sent_ud_sm",
"sk": "xx_sent_ud_sm",
"sr": "xx_sent_ud_sm",
"sv": "xx_sent_ud_sm",
"te": "xx_sent_ud_sm",
"vi": "xx_sent_ud_sm",
"zh": "zh_core_web_sm",
}
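# languages without an entry above fall back to a blank ``spacy.blank`` pipeline
# with rule-based sentence splitting (see ``SpacySentenceSplitter.__init__``)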


class SpacySentenceSplitter(BaseSentenceSplitter):
"""
A :obj:`SentenceSplitter` that uses spaCy's built-in sentence boundary detection.
Args:
language (:obj:`str`, optional, defaults to :obj:`en`):
Language of the text to tokenize.
model_type (:obj:`str`, optional, defaults to :obj:`statistical`):
Three different type of sentence splitter:
- ``dependency``: sentence splitter uses a dependency parse to detect sentence boundaries,
slow, but accurate.
- ``statistical``:
- ``rule_based``: It's fast and has a small memory footprint, since it uses punctuation to detect
sentence boundaries.
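
    Example (an illustrative sketch; it requires the mapped spaCy model,
    e.g. ``xx_sent_ud_sm``, to be installed):

        >>> splitter = SpacySentenceSplitter(language="en", model_type="statistical")
        >>> splitter.split_sentences("Hello world. How are you?")  # doctest: +SKIP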
"""
    def __init__(self, language: str = "en", model_type: str = "statistical") -> None:
        # spaCy's dependency parser is only needed when it is used for sentence
        # boundary detection; otherwise a lighter pipeline is loaded
        dep = model_type == "dependency"
        if language in SPACY_LANGUAGE_MAPPER:
            self.spacy = load_spacy(SPACY_LANGUAGE_MAPPER[language], parse=dep)
        else:
            self.spacy = spacy.blank(language)
            # force the type to rule_based, since there is no pre-trained model
            # for this language
            model_type = "rule_based"
        if model_type == "dependency":
            # the dependency parser must be requested at model load time
            # (``parse=True`` above), so there is nothing left to do here
            pass
        elif model_type == "statistical":
            if not self.spacy.has_pipe("senter"):
                self.spacy.enable_pipe("senter")
        elif model_type == "rule_based":
            # ``sentencizer`` is spaCy's built-in component for rule-based
            # sentence boundary detection
            if not self.spacy.has_pipe("sentencizer"):
                self.spacy.add_pipe("sentencizer")
        else:
            raise ValueError(
                f"type {model_type} not supported. Choose between `dependency`, "
                f"`statistical` or `rule_based`"
            )
    def __call__(
        self,
        texts: Union[str, List[str], List[List[str]]],
        max_length: Optional[int] = None,
        is_split_into_words: bool = False,
        **kwargs,
    ) -> Union[List[str], List[List[str]]]:
"""
Tokenize the input into single words using SpaCy models.
Args:
texts (:obj:`str`, :obj:`List[str]`, :obj:`List[List[str]]`):
Text to tag. It can be a single string, a batch of string and pre-tokenized strings.
max_len (:obj:`int`, optional, defaults to :obj:`0`):
Maximum length of a single text. If the text is longer than `max_len`, it will be split
into multiple sentences.
Returns:
:obj:`List[List[str]]`: The input doc split into sentences.
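
        Example (illustrative; exact boundaries depend on the underlying model):

            >>> splitter = SpacySentenceSplitter()
            >>> splitter(["First doc. It has two sentences.", "Second doc."])  # doctest: +SKIP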
"""
        # check whether the input is batched or a single sample
        is_batched = self.check_is_batched(texts, is_split_into_words)
        if is_batched:
            sents = self.split_sentences_batch(texts)
        else:
            sents = self.split_sentences(texts, max_length)
        return sents
    @staticmethod
    def chunked(iterable: List[Any], n: int) -> Iterable[List[Any]]:
"""
Chunks a list into n sized chunks.
Args:
iterable (:obj:`List[Any]`):
List to chunk.
n (:obj:`int`):
Size of the chunks.
Returns:
:obj:`Iterable[List[Any]]`: The input list chunked into n sized chunks.
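
        Example:

            >>> SpacySentenceSplitter.chunked([1, 2, 3, 4, 5], 2)
            [[1, 2], [3, 4], [5]]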
"""
return [iterable[i : i + n] for i in range(0, len(iterable), n)]
    def split_sentences(
        self, text: str | List[Word], max_length: Optional[int] = None, *args, **kwargs
    ) -> List[str]:
"""
Splits a `text` into smaller sentences.
Args:
text (:obj:`str`):
Text to split.
max_length (:obj:`int`, optional, defaults to :obj:`0`):
Maximum length of a single sentence. If the text is longer than `max_len`, it will be split
into multiple sentences.
Returns:
:obj:`List[str]`: The input text split into sentences.
"""
        sentences = list(self.spacy(text).sents)
        if max_length is not None and max_length > 0:
            # spaCy ``Span`` objects support slicing, so sentences are chunked
            # token-wise rather than character-wise
            sentences = [
                chunk
                for sentence in sentences
                for chunk in self.chunked(sentence, max_length)
            ]
        return sentences
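

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's public API: split a short
    # text with the statistical splitter. Running it requires the mapped spaCy
    # model (here ``xx_sent_ud_sm`` for English) to be installed.
    splitter = SpacySentenceSplitter(language="en", model_type="statistical")
    for sentence in splitter("This is the first sentence. This is the second one."):
        print(sentence)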