sivan22 committed on
Commit
a6e5932
·
1 Parent(s): f705981

Upload 3 files

Browse files
Files changed (2) hide show
  1. config.json +1 -2
  2. rabtokenizer.py +31 -0
config.json CHANGED
@@ -1,5 +1,4 @@
1
  {
2
- "_name_or_path": "BEREL_PyTorch_Model",
3
  "architectures": [
4
  "BertForMaskedLM"
5
  ],
@@ -19,7 +18,7 @@
19
  "pad_token_id": 0,
20
  "position_embedding_type": "absolute",
21
  "torch_dtype": "float32",
22
- "transformers_version": "4.28.1",
23
  "type_vocab_size": 2,
24
  "use_cache": true,
25
  "vocab_size": 128000
 
1
  {
 
2
  "architectures": [
3
  "BertForMaskedLM"
4
  ],
 
18
  "pad_token_id": 0,
19
  "position_embedding_type": "absolute",
20
  "torch_dtype": "float32",
21
+ "transformers_version": "4.12.0.dev0",
22
  "type_vocab_size": 2,
23
  "use_cache": true,
24
  "vocab_size": 128000
rabtokenizer.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import BertTokenizer, BasicTokenizer
2
+ from transformers.tokenization_utils import _is_punctuation
3
+
4
class OurBasicTokenizer(BasicTokenizer):
    """BasicTokenizer whose punctuation splitting leaves some quote marks inside words.

    Only ``_run_split_on_punc`` is overridden; everything else is inherited.
    """

    def _run_split_on_punc(self, text, never_split=None):
        """Split ``text`` into runs of ordinary characters and single punctuation marks.

        Every punctuation character (as judged by ``_is_punctuation``) becomes
        its own piece, with two exceptions that stay attached to the
        surrounding word:

        * an apostrophe (``'``) is never a split point;
        * a double quote (``"``) is not a split point when it is followed by a
          non-punctuation character.

        NOTE(review): presumably this keeps quote-marked abbreviations intact;
        confirm against the model's training-time tokenization.

        Args:
            text: The piece of text to split.
            never_split: Optional extra collection of strings that must be
                returned unsplit, in addition to ``self.never_split``.

        Returns:
            A list of string pieces, in original order.
        """
        # Protected tokens are returned verbatim.
        if text in self.never_split:
            return [text]
        if never_split and text in never_split:
            return [text]

        groups = []
        word_open = False  # True while the last group is a word still being extended
        last_idx = len(text) - 1

        for idx, ch in enumerate(text):
            is_split_point = _is_punctuation(ch) and ch != "'"
            # A double quote directly followed by a non-punctuation character
            # is treated as part of the word.
            if is_split_point and ch == '"' and idx < last_idx and not _is_punctuation(text[idx + 1]):
                is_split_point = False

            if is_split_point:
                groups.append([ch])
                word_open = False
                continue

            if not word_open:
                groups.append([])
                word_open = True
            groups[-1].append(ch)

        return ["".join(group) for group in groups]
26
+
27
+
28
def RabbinicTokenizer(tok):
    """Swap ``tok``'s basic tokenizer for an ``OurBasicTokenizer`` and return ``tok``.

    The tokenizer is modified in place; the return value is the same object,
    returned for call-chaining convenience.

    The replacement is built from the settings of the existing
    ``tok.basic_tokenizer``. Besides ``do_lower_case`` and ``never_split``,
    the ``tokenize_chinese_chars`` and ``strip_accents`` settings are carried
    over as well, so a tokenizer loaded with non-default values for them keeps
    that configuration instead of silently reverting to the defaults.

    Args:
        tok: A tokenizer object exposing a ``basic_tokenizer`` attribute.

    Returns:
        The same ``tok`` object, with ``basic_tokenizer`` replaced.
    """
    base = tok.basic_tokenizer
    tok.basic_tokenizer = OurBasicTokenizer(
        do_lower_case=base.do_lower_case,
        never_split=base.never_split,
        # getattr fallbacks mirror BasicTokenizer's documented defaults in
        # case the wrapped object does not expose these attributes.
        tokenize_chinese_chars=getattr(base, "tokenize_chinese_chars", True),
        strip_accents=getattr(base, "strip_accents", None),
    )
    return tok
31
+