smallcloudteam committed on
Commit
d3e29c0
·
1 Parent(s): 5cc155f

Upload tokenizer

Browse files
special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<|endoftext|>",
3
+ "eos_token": "<|endoftext|>",
4
+ "unk_token": "<|endoftext|>"
5
+ }
tokenization_codify_fast.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from typing import TYPE_CHECKING, List, Optional, Tuple
3
+
4
+ from tokenizers import pre_tokenizers
5
+
6
+ from transformers.tokenization_utils_base import BatchEncoding
7
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
8
+ from transformers.utils import logging
9
+
10
+
11
+ if TYPE_CHECKING:
12
+ from transformers.pipelines.conversational import Conversation
13
+
14
+
15
+ logger = logging.get_logger(__name__)
16
+
17
# Maps each vocabulary-file role to the file name expected inside a checkpoint.
VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"}

# Download location of the tokenizer file for each published checkpoint.
# The URLs go through `/resolve/main/`, which serves the raw file contents;
# `/blob/main/` addresses the Hub's HTML file viewer rather than the JSON itself.
PRETRAINED_VOCAB_FILES_MAP = {
    "tokenizer_file": {
        "smallcloudai/codify_medium_multi": (
            "https://huggingface.co/smallcloudai/codify_medium_multi/resolve/main/tokenizer.json"
        ),
        "smallcloudai/codify_3b_multi": (
            "https://huggingface.co/smallcloudai/codify_3b_multi/resolve/main/tokenizer.json"
        ),
    },
}
25
+
26
+
27
class CodifyTokenizerFast(PreTrainedTokenizerFast):
    """Fast tokenizer for the Codify checkpoints.

    The tokenizer is described by a single `tokenizer.json` file (see
    `VOCAB_FILES_NAMES`) and has no slow counterpart
    (`slow_tokenizer_class` is `None`). By default `<|endoftext|>` serves as
    the unknown, beginning-of-sequence and end-of-sequence token.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    model_input_names = ["input_ids", "attention_mask"]
    slow_tokenizer_class = None

    def __init__(
        self,
        vocab_file=None,
        merges_file=None,
        tokenizer_file=None,
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        add_prefix_space=False,
        **kwargs
    ):
        """Build the tokenizer and align the backend pre-tokenizer with `add_prefix_space`.

        Args:
            vocab_file: Forwarded positionally to the parent constructor.
            merges_file: Forwarded positionally to the parent constructor.
            tokenizer_file: Forwarded to the parent constructor as `tokenizer_file`.
            unk_token: Unknown token string.
            bos_token: Beginning-of-sequence token string.
            eos_token: End-of-sequence token string.
            add_prefix_space: Value the backend pre-tokenizer's
                `add_prefix_space` setting must end up with; also stored on
                the instance and consulted before encoding pretokenized input.
            **kwargs: Extra keyword arguments passed through to the parent.
        """
        super().__init__(
            vocab_file,
            merges_file,
            tokenizer_file=tokenizer_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )

        # The serialized pre-tokenizer state is a JSON document; if it records
        # a different `add_prefix_space` than requested, rebuild the
        # pre-tokenizer of the same type with the requested value.
        pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__())
        if pre_tok_state.get("add_prefix_space", add_prefix_space) != add_prefix_space:
            pre_tok_class = getattr(pre_tokenizers, pre_tok_state.pop("type"))
            pre_tok_state["add_prefix_space"] = add_prefix_space
            self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state)

        self.add_prefix_space = add_prefix_space

    def _check_pretokenized_support(self, is_split_into_words: bool) -> None:
        """Reject pretokenized input unless the tokenizer was built with `add_prefix_space=True`.

        Raises:
            ValueError: If `is_split_into_words` is truthy while
                `self.add_prefix_space` is falsy.
        """
        if is_split_into_words and not self.add_prefix_space:
            raise ValueError(
                f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with"
                " pretokenized inputs."
            )

    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """Validate the `is_split_into_words` keyword, then defer to the parent implementation.

        Raises:
            ValueError: If pretokenized input is requested without `add_prefix_space=True`.
        """
        self._check_pretokenized_support(kwargs.get("is_split_into_words", False))
        return super()._batch_encode_plus(*args, **kwargs)

    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
        """Validate the `is_split_into_words` keyword, then defer to the parent implementation.

        Raises:
            ValueError: If pretokenized input is requested without `add_prefix_space=True`.
        """
        self._check_pretokenized_support(kwargs.get("is_split_into_words", False))
        return super()._encode_plus(*args, **kwargs)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str, ...]:
        """Save the backend model into `save_directory` and return the written file paths.

        Args:
            save_directory: Directory handed to the backend model's `save`.
            filename_prefix: Optional prefix, passed to `save` as `name`.

        Returns:
            The values returned by the backend model's `save`, as a tuple.
        """
        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
        return tuple(files)

    def _build_conversation_input_ids(self, conversation: "Conversation") -> List[int]:
        """This corresponds to DialoGPT variants of models.

        Each text yielded by `conversation.iter_texts()` is encoded without
        special tokens and followed by the end-of-sequence id. If the result
        is longer than `model_max_length`, only the trailing
        `model_max_length` ids are kept.
        """
        input_ids = []
        # The speaker flag is not used: every turn is encoded the same way.
        for _is_user, text in conversation.iter_texts():
            input_ids.extend(self.encode(text, add_special_tokens=False) + [self.eos_token_id])

        if len(input_ids) > self.model_max_length:
            input_ids = input_ids[-self.model_max_length :]
        return input_ids
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "auto_map": {
4
+ "AutoTokenizer": [
5
+ null,
6
+ "tokenization_codify_fast.CodifyTokenizerFast"
7
+ ]
8
+ },
9
+ "bos_token": "<|endoftext|>",
10
+ "eos_token": "<|endoftext|>",
11
+ "tokenizer_class": "CodifyTokenizer",
12
+ "unk_token": "<|endoftext|>"
13
+ }