|
from transformers import BertTokenizer, BasicTokenizer |
|
from transformers.tokenization_utils import _is_punctuation |
|
|
|
class OurBasicTokenizer(BasicTokenizer):
    """``BasicTokenizer`` with a customised punctuation-splitting rule.

    Only ``_run_split_on_punc`` is overridden; everything else is inherited.
    """

    def _run_split_on_punc(self, text, never_split=None):
        """Split ``text`` into runs of word characters and single punctuation marks.

        Every character for which ``_is_punctuation`` is true becomes its own
        piece, with two exceptions that stay inside the surrounding word:

        * the apostrophe ``'`` (always), and
        * the double quote ``"`` when it is followed by a character that is
          not punctuation.

        NOTE(review): the quote exception presumably keeps abbreviations
        written with an embedded ``"`` in one piece -- confirm with the
        intended corpus.

        Args:
            text: The piece of text to split.
            never_split: Optional extra collection of strings that must be
                returned unsplit, checked in addition to ``self.never_split``.

        Returns:
            A list of strings; ``[text]`` when ``text`` is protected, and an
            empty list when ``text`` is empty and not protected.
        """
        # Protected tokens are returned verbatim.
        if text in self.never_split:
            return [text]
        if never_split and text in never_split:
            return [text]

        pieces = []
        word = []  # characters of the word currently being accumulated
        last_index = len(text) - 1

        for pos, ch in enumerate(text):
            splits = _is_punctuation(ch) and ch != "'"
            if splits and ch == '"' and pos < last_index:
                # A double quote directly followed by a non-punctuation
                # character is treated as part of the word.
                splits = _is_punctuation(text[pos + 1])

            if not splits:
                word.append(ch)
                continue

            # Punctuation ends the current word (if any) and stands alone.
            if word:
                pieces.append("".join(word))
                word = []
            pieces.append(ch)

        if word:
            pieces.append("".join(word))

        return pieces
|
|
|
|
|
def RabbinicTokenizer(tok):
    """Swap ``tok.basic_tokenizer`` for an ``OurBasicTokenizer`` and return ``tok``.

    The replacement is built from the existing basic tokenizer's
    ``do_lower_case`` and ``never_split`` attributes, passed positionally.
    ``tok`` is modified in place; the same object is returned.

    NOTE(review): only those two settings are forwarded; any other options
    on the existing basic tokenizer are not passed to the replacement --
    confirm this is intended.

    Args:
        tok: An object exposing a ``basic_tokenizer`` attribute.

    Returns:
        The same ``tok`` object, with its ``basic_tokenizer`` replaced.
    """
    current = tok.basic_tokenizer
    lower_case = current.do_lower_case
    protected = current.never_split
    tok.basic_tokenizer = OurBasicTokenizer(lower_case, protected)
    return tok
|
|
|
|