from transformers import BertTokenizer, BasicTokenizer
from transformers.tokenization_utils import _is_punctuation


class OurBasicTokenizer(BasicTokenizer):
    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text, except for apostrophes and for
        double quotes that are followed by a non-punctuation character, so that
        such marks stay attached inside a word."""
        if text in self.never_split or (never_split and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            # Split on punctuation, but never on an apostrophe, and not on a
            # double quote whose next character is not punctuation (i.e. a
            # quote sitting inside a word rather than closing it).
            if _is_punctuation(char) and char != "'" and not (
                char == '"' and i + 1 < len(chars) and not _is_punctuation(chars[i + 1])
            ):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1
        return ["".join(x) for x in output]

def RabbinicTokenizer(tok):
    """Replaces the basic tokenizer of a BertTokenizer with OurBasicTokenizer."""
    tok.basic_tokenizer = OurBasicTokenizer(tok.basic_tokenizer.do_lower_case, tok.basic_tokenizer.never_split)
    return tok
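

# --- Illustrative usage (not part of the original file) ----------------------
# A minimal sketch of how the pieces fit together. The custom splitter keeps
# abbreviation marks attached to their word; this behaviour appears aimed at
# Hebrew geresh/gershayim, e.g. ר' and רמב"ם. The checkpoint path in the
# commented-out lines below is a placeholder, not a real model name.
if __name__ == "__main__":
    basic = OurBasicTokenizer(do_lower_case=False, never_split=None)
    print(basic.tokenize('ר\' יוחנן, רמב"ם.'))
    # -> ["ר'", 'יוחנן', ',', 'רמב"ם', '.']

    # Wrapping a full BertTokenizer works the same way:
    # tok = RabbinicTokenizer(BertTokenizer.from_pretrained("path/to/checkpoint"))
    # print(tok.tokenize('רמב"ם'))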