from transformers import BertTokenizer, BasicTokenizer
from transformers.tokenization_utils import _is_punctuation

class OurBasicTokenizer(BasicTokenizer):
    """A BasicTokenizer whose punctuation splitting keeps apostrophes, and double quotes
    that are immediately followed by a non-punctuation character, attached to the word."""

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text, except for the quote characters noted above."""
        if text in self.never_split or (never_split and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            # Split on punctuation, except: never split on an apostrophe, and keep a double
            # quote attached when the next character is not punctuation (e.g. a quote inside
            # a word, as in Hebrew abbreviations).
            is_splittable_punc = (
                _is_punctuation(char)
                and char != "'"
                and not (char == '"' and i + 1 < len(chars) and not _is_punctuation(chars[i + 1]))
            )
            if is_splittable_punc:
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]


def RabbinicTokenizer(tok):
    """Replaces the basic tokenizer of a BertTokenizer with OurBasicTokenizer, keeping its settings."""
    tok.basic_tokenizer = OurBasicTokenizer(tok.basic_tokenizer.do_lower_case, tok.basic_tokenizer.never_split)
    return tok
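

# Usage sketch (not part of the original file): wraps a pretrained BertTokenizer so that
# apostrophes and word-internal double quotes survive basic tokenization. The
# "bert-base-uncased" checkpoint is only a stand-in assumption; use whichever BERT
# checkpoint the model was actually trained with.
if __name__ == "__main__":
    tok = RabbinicTokenizer(BertTokenizer.from_pretrained("bert-base-uncased"))
    # The double quote between letters stays attached and the apostrophe is never split,
    # while the trailing period and quote are still separated:
    # ['ab"c', "don't", 'end', '.', '"']
    print(tok.basic_tokenizer.tokenize('ab"c don\'t end."'))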