Committed by pragaash
Commit ebac1e8
Parent: 158e9d1

Remove tokenizer.json and replace tokenizer.py with correct version.

Files changed (2):
  1. tokenizer.json +0 -129
  2. tokenizer.py +106 -108
tokenizer.json DELETED
@@ -1,129 +0,0 @@
- # based on https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
- from __future__ import annotations
-
- import torch
-
- import numpy as np
-
- from os import PathLike
- from typing import List, Tuple
-
- from tokenizers import Tokenizer
- from transformers.tokenization_utils import PreTrainedTokenizer
- from transformers.tokenization_utils_base import BatchEncoding, TruncationStrategy
- from transformers.utils.generic import TensorType, PaddingStrategy
-
-
- EMPTY: str = ""
-
-
- class ByteTokenizer(PreTrainedTokenizer):
-
-     """UTF-8 Encoder."""
-
-     @classmethod
-     def from_pretrained(cls, model_id: str | PathLike, **kwargs) -> ByteTokenizer:
-
-         return cls(**kwargs, byte_level=True)
-
-     @property
-     def vocab_size(self) -> int:
-
-         return 512
-
-     @property
-     def byte_level(self) -> bool:
-
-         return self.init_kwargs.get('byte_level', True)
-
-     def get_vocab(self) -> Dict[str, int]:
-
-         return {chr(i): i for i in range(self.vocab_size)}
-
-     def __len__(self) -> int:
-
-         return self.vocab_size
-
-     def clamp(self, n: int) -> int:
-
-         return max(32, min(n, self.vocab_size))
-
-     def _tokenize(self, text: str, **kwargs) -> List[str]:
-
-         return list(text)
-
-     def byte_tokenize(self, text: str) -> np.ndarray:
-
-         return np.frombuffer(text.encode('utf-8'), dtype=np.uint8)
-
-     def _convert_token_to_id(self, token: str) -> int:
-
-         return self.clamp(ord(token))
-
-     def _convert_id_to_token(self, index: int) -> str:
-
-         return chr(self.clamp(index))
-
-     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-
-         return EMPTY.join(tokens)
-
-     def _decode(self, token_ids: List[int], **kwargs) -> str:
-
-         indices = np.asarray(token_ids, dtype=np.uint8)
-
-         return (
-             indices.clip(min=32, max=self.vocab_size, out=indices)
-             .tobytes()
-             .decode('utf-8')
-         )
-
-     def _encode_plus(self, text: str, **kwargs) -> BatchEncoding:
-
-         first_ids = self.byte_tokenize(text).tolist()
-
-         return self.prepare_for_model(
-             first_ids,
-             pair_ids=None,
-             add_special_tokens=kwargs.get('add_special_tokens', False),
-             padding=kwargs.get('padding_strategy', PaddingStrategy.DO_NOT_PAD).value,
-             truncation=kwargs.get('truncation_strategy', TruncationStrategy.DO_NOT_TRUNCATE).value,
-             max_length=kwargs.get('max_length'),
-             stride=kwargs.get('stride', 0),
-             pad_to_multiple_of=kwargs.get('pad_to_multiple_of'),
-             return_tensors=kwargs.get('return_tensors'),
-             prepend_batch_axis=True,
-             return_attention_mask=kwargs.get('return_attention_mask'),
-             return_token_type_ids=kwargs.get('return_token_type_ids'),
-             return_overflowing_tokens=kwargs.get('return_overflowing_tokens', False),
-             return_special_tokens_mask=kwargs.get('return_special_tokens_mask', False),
-             return_length=kwargs.get('return_length', False),
-             verbose=kwargs.get('verbose', True),
-         )
-
-     def _batch_encode_plus(self, batch_text: List[str], **kwargs) -> BatchEncoding:
-
-         input_ids = [(self.byte_tokenize(text).tolist(), None) for text in batch_text]
-
-         return self._batch_prepare_for_model(
-             input_ids,
-             add_special_tokens=kwargs.get('add_special_tokens', False),
-             padding_strategy=kwargs.get('padding_strategy', PaddingStrategy.DO_NOT_PAD),
-             truncation_strategy=kwargs.get('truncation_strategy', TruncationStrategy.DO_NOT_TRUNCATE),
-             max_length=kwargs.get('max_length'),
-             stride=kwargs.get('stride', 0),
-             pad_to_multiple_of=kwargs.get('pad_to_multiple_of'),
-             return_attention_mask=kwargs.get('return_attention_mask'),
-             return_token_type_ids=kwargs.get('return_token_type_ids'),
-             return_overflowing_tokens=kwargs.get('return_overflowing_tokens', False),
-             return_special_tokens_mask=kwargs.get('return_special_tokens_mask', False),
-             return_length=kwargs.get('return_length', False),
-             return_tensors=kwargs.get('return_tensors'),
-             verbose=kwargs.get('verbose', True),
-         )
-
-     def _save_pretrained(
-         self, save_directory: str | PathLike, file_names: Tuple[str], **kwargs
-     ) -> Tuple[str]:
-
-         return file_names
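
The deleted file above is not JSON at all but the byte-level tokenizer implementation itself, which this commit re-adds verbatim as tokenizer.py below. As a minimal sketch of the scheme it implements (an illustrative example, not from the repo; it assumes only numpy): token ids are simply the raw UTF-8 bytes of the input, so no vocabulary file is needed.

    import numpy as np

    # byte_tokenize: token ids are the raw UTF-8 bytes of the text
    text = "héllo"
    ids = np.frombuffer(text.encode("utf-8"), dtype=np.uint8).tolist()
    print(ids)                         # [104, 195, 169, 108, 108, 111]

    # decoding reverses the mapping byte for byte
    print(bytes(ids).decode("utf-8"))  # héllo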
tokenizer.py CHANGED
@@ -1,131 +1,129 @@
  # based on https://github.com/EleutherAI/gpt-neox/blob/main/megatron/tokenizer/tokenizer.py
- from abc import ABC
- import json
- import pathlib
+ from __future__ import annotations

  import torch
- import tqdm
- from tokenizers import Tokenizer
- from transformers.tokenization_utils import PreTrainedTokenizer
- from abc import abstractmethod
- from typing import Any, List, Union
+
  import numpy as np

+ from os import PathLike
+ from typing import List, Tuple

- class HFAutoTokenizer:
-     def __init__(self, vocab_file):
-         self.tokenizer = Tokenizer.from_file(vocab_file)
-         self.eos = "</s>"
-         self.bos = "<s>"
-         self.eos_id = self.tokenize(self.eos)
-         self.bos_id = self.tokenize(self.bos)
-         self.vsize = 32000
-
-     def encode_to_list(self, text):
-         return self.tokenizer.encode(text, add_special_tokens=False)
-
-     def tokenize_file(self, input_file, output_file, verbose=False):
-         if verbose:
-             print(f"Tokenizing file: {input_file}")
-
-         if pathlib.Path(output_file).exists():
-             print(f"Output file {output_file} already exists, skipping")
-             return
-         with open(input_file, "r") as fin, open(output_file, "w") as fout:
-             for line in tqdm.tqdm(fin):
-                 if verbose:
-                     print(f"Tokenizing line: {line[-200:]}")
-                 data = json.loads(line.strip())
-                 if "text" not in data.keys():
-                     break
-                 tokenized_data = self.tokenize(data["text"])
-                 fout.write(json.dumps({"tokens": tokenized_data}) + "\n")
-
-     def tokenize(self, text: str, *args, **kwargs):
-         ids = self.tokenizer.encode(text)
-         if type(ids) == list:
-             return torch.tensor(ids)
-         else:
-             return torch.tensor(ids.ids)
-
-     def tokenize_batch(self, text_batch):
-         return self.tokenizer.encode_batch(text_batch)
-
-     def detokenize(self, token_ids, skip_special_tokens=False):
-         return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
-
-     def detokenize_batch(self, token_ids_batch, skip_special_tokens=False):
-         out = []
-         for token_ids in token_ids_batch:
-             out.append(
-                 self.detokenize(
-                     [t.item() for t in token_ids],
-                     skip_special_tokens=skip_special_tokens,
-                 )
-             )
-         return out
+ from tokenizers import Tokenizer
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.tokenization_utils_base import BatchEncoding, TruncationStrategy
+ from transformers.utils.generic import TensorType, PaddingStrategy

-     @property
-     def eod(self):
-         return self.eod_id

-     @property
-     def vocab_size(self):
-         return 32000
+ EMPTY: str = ""


  class ByteTokenizer(PreTrainedTokenizer):
+
      """UTF-8 Encoder."""
-     def __init__(self):
-         super().__init__(
-             bos_token=self.decode_token(2),
-             eos_token=self.decode_token(0),
-             unk_token=self.decode_token(0),
-             pad_token=self.decode_token(1),
-             mask_token=self.decode_token(3),
-         )
-
+
+     @classmethod
+     def from_pretrained(cls, model_id: str | PathLike, **kwargs) -> ByteTokenizer:
+
+         return cls(**kwargs, byte_level=True)
+
      @property
      def vocab_size(self) -> int:
+
          return 512

-     @classmethod
-     def from_pretrained(cls, *args, **kwargs):
-         return cls()
+     @property
+     def byte_level(self) -> bool:
+
+         return self.init_kwargs.get('byte_level', True)
+
+     def get_vocab(self) -> Dict[str, int]:
+
+         return {chr(i): i for i in range(self.vocab_size)}

-     def get_vocab(self):
-         return {str(i): i for i in range(512)}
+     def __len__(self) -> int:
+
+         return self.vocab_size
+
+     def clamp(self, n: int) -> int:

-     def clamp(self, n):
          return max(32, min(n, self.vocab_size))

-     def decode_token(self, token: int):
-         return str(chr(self.clamp(token)))
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+
+         return list(text)

-     def __call__(self, text: str, return_tensors: bool = False, *args, **kwargs):
-         ids = torch.tensor(self.tokenize(text), dtype=torch.long).unsqueeze(0)
-         return {"input_ids": ids} if return_tensors == False else ids
+     def byte_tokenize(self, text: str) -> np.ndarray:

-     def _tokenize(self, text: str):
          return np.frombuffer(text.encode('utf-8'), dtype=np.uint8)
-
-     def tokenize(self, text: str):
-         return self._tokenize(text).tolist()
-
-     def tokenize_batch(self, text_batch: Union[List[str], str]):
-         if isinstance(text_batch, list):
-             return [self.tokenize(s) for s in text_batch]
-         else:
-             return self.tokenize(text_batch)
-
-     def decode(self, token_ids):
-         return "".join(list(map(self.decode_token, token_ids)))
-
-     def decode_batch(self, token_ids: Union[List[str], str]):
-         if isinstance(token_ids, list):
-             return [self.decode(s) for s in token_ids]
-
-         elif isinstance(token_ids, torch.Tensor):
-             return [self.decode(s) for s in token_ids.tolist()]
-         else:
-             return self.decode(token_ids)
+
+     def _convert_token_to_id(self, token: str) -> int:
+
+         return self.clamp(ord(token))
+
+     def _convert_id_to_token(self, index: int) -> str:
+
+         return chr(self.clamp(index))
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+
+         return EMPTY.join(tokens)
+
+     def _decode(self, token_ids: List[int], **kwargs) -> str:
+
+         indices = np.asarray(token_ids, dtype=np.uint8)
+
+         return (
+             indices.clip(min=32, max=self.vocab_size, out=indices)
+             .tobytes()
+             .decode('utf-8')
+         )
+
+     def _encode_plus(self, text: str, **kwargs) -> BatchEncoding:
+
+         first_ids = self.byte_tokenize(text).tolist()
+
+         return self.prepare_for_model(
+             first_ids,
+             pair_ids=None,
+             add_special_tokens=kwargs.get('add_special_tokens', False),
+             padding=kwargs.get('padding_strategy', PaddingStrategy.DO_NOT_PAD).value,
+             truncation=kwargs.get('truncation_strategy', TruncationStrategy.DO_NOT_TRUNCATE).value,
+             max_length=kwargs.get('max_length'),
+             stride=kwargs.get('stride', 0),
+             pad_to_multiple_of=kwargs.get('pad_to_multiple_of'),
+             return_tensors=kwargs.get('return_tensors'),
+             prepend_batch_axis=True,
+             return_attention_mask=kwargs.get('return_attention_mask'),
+             return_token_type_ids=kwargs.get('return_token_type_ids'),
+             return_overflowing_tokens=kwargs.get('return_overflowing_tokens', False),
+             return_special_tokens_mask=kwargs.get('return_special_tokens_mask', False),
+             return_length=kwargs.get('return_length', False),
+             verbose=kwargs.get('verbose', True),
+         )
+
+     def _batch_encode_plus(self, batch_text: List[str], **kwargs) -> BatchEncoding:
+
+         input_ids = [(self.byte_tokenize(text).tolist(), None) for text in batch_text]
+
+         return self._batch_prepare_for_model(
+             input_ids,
+             add_special_tokens=kwargs.get('add_special_tokens', False),
+             padding_strategy=kwargs.get('padding_strategy', PaddingStrategy.DO_NOT_PAD),
+             truncation_strategy=kwargs.get('truncation_strategy', TruncationStrategy.DO_NOT_TRUNCATE),
+             max_length=kwargs.get('max_length'),
+             stride=kwargs.get('stride', 0),
+             pad_to_multiple_of=kwargs.get('pad_to_multiple_of'),
+             return_attention_mask=kwargs.get('return_attention_mask'),
+             return_token_type_ids=kwargs.get('return_token_type_ids'),
+             return_overflowing_tokens=kwargs.get('return_overflowing_tokens', False),
+             return_special_tokens_mask=kwargs.get('return_special_tokens_mask', False),
+             return_length=kwargs.get('return_length', False),
+             return_tensors=kwargs.get('return_tensors'),
+             verbose=kwargs.get('verbose', True),
+         )
+
+     def _save_pretrained(
+         self, save_directory: str | PathLike, file_names: Tuple[str], **kwargs
+     ) -> Tuple[str]:
+
+         return file_names
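
The block below is an illustrative usage sketch, not part of the commit: it shows how the ByteTokenizer now in tokenizer.py could be exercised directly, assuming tokenizer.py is importable locally and a transformers version compatible with the code above. The model id string is a placeholder; from_pretrained as written ignores it and simply instantiates the class with byte_level=True.

    from tokenizer import ByteTokenizer  # local tokenizer.py from this repo

    tok = ByteTokenizer.from_pretrained("placeholder-model-id")  # id is unused by from_pretrained

    # __call__ routes through _encode_plus / prepare_for_model
    enc = tok("hello world")
    print(enc["input_ids"])              # raw UTF-8 byte ids: [104, 101, 108, ...]

    # decode routes through _decode, which clamps the ids and UTF-8-decodes them
    print(tok.decode(enc["input_ids"]))  # "hello world"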