Update tokenization_xgen.py
#28
by
rooa
- opened
- tokenization_xgen.py +12 -0
tokenization_xgen.py
CHANGED
@@ -60,9 +60,18 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
|
60 |
]
|
61 |
return fim_tokens
|
62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
add_whitespaces = include_whitespace(n_min=2, n_max=32)
|
64 |
add_tabs = include_tabs(n_min=2, n_max=10)
|
65 |
fim_tokens = include_fim_tokens()
|
|
|
66 |
|
67 |
tokenizer = tiktoken.get_encoding(base)
|
68 |
|
@@ -82,6 +91,9 @@ def tiktoken_tokenizer(base="gpt2", pad_token=None, add_special=True):
|
|
82 |
for sp in fim_tokens:
|
83 |
special_tokens[sp] = idx
|
84 |
idx += 1
|
|
|
|
|
|
|
85 |
|
86 |
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
87 |
special_tokens[pad_token] = idx
|
|
|
60 |
]
|
61 |
return fim_tokens
|
62 |
|
63 |
+
def include_additional_tokens():
    """Return the extra special-token strings, in the order they are emitted.

    The list is laid out as:
      * four placeholders ``<dummy_0>`` .. ``<dummy_3>``,
      * ``<sep>`` followed by ``<eom>``,
      * ``<mask_k>`` entries counting DOWN from 881 to 1.

    The surrounding code hands out consecutive ids while iterating this
    list, so the ordering here determines each token's id.

    Returns:
        list[str]: 887 token strings (4 dummies + 2 separators + 881 masks).
    """
    # The mask count is kept as the same id-range arithmetic the original used
    # (51199 - 50318 = 881); presumably 51199 is the last id to fill and
    # 50318 is the id of <eom> -- TODO confirm against the vocabulary size.
    mask_count = 51199 - 50318

    placeholders = [f"<dummy_{k}>" for k in range(4)]
    delimiters = ["<sep>", "<eom>"]  # annotated as ids 50317 and 50318
    # Descending order: the highest-numbered mask comes first.
    masks = [f"<mask_{k}>" for k in range(mask_count, 0, -1)]

    return placeholders + delimiters + masks
|
70 |
+
|
71 |
add_whitespaces = include_whitespace(n_min=2, n_max=32)
|
72 |
add_tabs = include_tabs(n_min=2, n_max=10)
|
73 |
fim_tokens = include_fim_tokens()
|
74 |
+
additional_tokens = include_additional_tokens()
|
75 |
|
76 |
tokenizer = tiktoken.get_encoding(base)
|
77 |
|
|
|
91 |
for sp in fim_tokens:
|
92 |
special_tokens[sp] = idx
|
93 |
idx += 1
|
94 |
+
for sp in additional_tokens:
|
95 |
+
special_tokens[sp] = idx
|
96 |
+
idx += 1
|
97 |
|
98 |
if pad_token and pad_token not in tokenizer._special_tokens and pad_token not in special_tokens:
|
99 |
special_tokens[pad_token] = idx
|