fast tokenizer and stream_chat fix

#5
by x54-729 - opened
config.json CHANGED
@@ -3,7 +3,7 @@
3
  "InternLM2ForCausalLM"
4
  ],
5
  "auto_map": {
6
- "AutoConfig": "configuration_internlm.InternLMConfig",
7
  "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM",
8
  "AutoModel": "modeling_internlm2.InternLM2ForCausalLM"
9
  },
@@ -15,7 +15,7 @@
15
  "initializer_range": 0.02,
16
  "intermediate_size": 14336,
17
  "max_position_embeddings": 32768,
18
- "model_type": "internlm",
19
  "num_attention_heads": 32,
20
  "num_hidden_layers": 32,
21
  "num_key_value_heads": 8,
 
3
  "InternLM2ForCausalLM"
4
  ],
5
  "auto_map": {
6
+ "AutoConfig": "configuration_internlm2.InternLM2Config",
7
  "AutoModelForCausalLM": "modeling_internlm2.InternLM2ForCausalLM",
8
  "AutoModel": "modeling_internlm2.InternLM2ForCausalLM"
9
  },
 
15
  "initializer_range": 0.02,
16
  "intermediate_size": 14336,
17
  "max_position_embeddings": 32768,
18
+ "model_type": "internlm2",
19
  "num_attention_heads": 32,
20
  "num_hidden_layers": 32,
21
  "num_key_value_heads": 8,
configuration_internlm.py → configuration_internlm2.py RENAMED
@@ -1,10 +1,7 @@
1
  # coding=utf-8
2
- # Copyright (c) InternLM. All rights reserved.
3
  #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
  #
9
  # Licensed under the Apache License, Version 2.0 (the "License");
10
  # you may not use this file except in compliance with the License.
@@ -17,21 +14,22 @@
17
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
- """ InternLM model configuration"""
21
 
22
  from transformers.configuration_utils import PretrainedConfig
23
  from transformers.utils import logging
24
 
25
  logger = logging.get_logger(__name__)
26
 
27
- INTERNLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
28
 
29
 
30
- class InternLMConfig(PretrainedConfig):
 
31
  r"""
32
- This is the configuration class to store the configuration of a [`InternLMModel`]. It is used to instantiate
33
- an InternLM model according to the specified arguments, defining the model architecture. Instantiating a
34
- configuration with the defaults will yield a similar configuration to that of the InternLM-7B.
35
 
36
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
37
  documentation from [`PretrainedConfig`] for more information.
@@ -39,8 +37,8 @@ class InternLMConfig(PretrainedConfig):
39
 
40
  Args:
41
  vocab_size (`int`, *optional*, defaults to 32000):
42
- Vocabulary size of the InternLM model. Defines the number of different tokens that can be represented by the
43
- `inputs_ids` passed when calling [`InternLMModel`]
44
  hidden_size (`int`, *optional*, defaults to 4096):
45
  Dimension of the hidden representations.
46
  intermediate_size (`int`, *optional*, defaults to 11008):
@@ -73,19 +71,8 @@ class InternLMConfig(PretrainedConfig):
73
  Whether to tie weight embeddings
74
  Example:
75
 
76
- ```python
77
- >>> from transformers import InternLMModel, InternLMConfig
78
-
79
- >>> # Initializing a InternLM internlm-7b style configuration
80
- >>> configuration = InternLMConfig()
81
-
82
- >>> # Initializing a model from the internlm-7b style configuration
83
- >>> model = InternLMModel(configuration)
84
-
85
- >>> # Accessing the model configuration
86
- >>> configuration = model.config
87
- ```"""
88
- model_type = "internlm"
89
  _auto_class = "AutoConfig"
90
 
91
  def __init__( # pylint: disable=W0102
 
1
  # coding=utf-8
2
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
  #
4
+ # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
 
 
 
5
  #
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
  # you may not use this file except in compliance with the License.
 
14
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
  # See the License for the specific language governing permissions and
16
  # limitations under the License.
17
+ """ InternLM2 model configuration"""
18
 
19
  from transformers.configuration_utils import PretrainedConfig
20
  from transformers.utils import logging
21
 
22
  logger = logging.get_logger(__name__)
23
 
24
+ INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
25
 
26
 
27
+ # Modified from transformers.models.llama.configuration_llama.LlamaConfig
28
+ class InternLM2Config(PretrainedConfig):
29
  r"""
30
+ This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
31
+ an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
32
+ configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
33
 
34
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
  documentation from [`PretrainedConfig`] for more information.
 
37
 
38
  Args:
39
  vocab_size (`int`, *optional*, defaults to 32000):
40
+ Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
41
+ `inputs_ids` passed when calling [`InternLM2Model`]
42
  hidden_size (`int`, *optional*, defaults to 4096):
43
  Dimension of the hidden representations.
44
  intermediate_size (`int`, *optional*, defaults to 11008):
 
71
  Whether to tie weight embeddings
72
  Example:
73
 
74
+ """
75
+ model_type = "internlm2"
 
 
 
 
 
 
 
 
 
 
 
76
  _auto_class = "AutoConfig"
77
 
78
  def __init__( # pylint: disable=W0102
modeling_internlm2.py CHANGED
@@ -45,7 +45,7 @@ try:
45
  except: # noqa # pylint: disable=bare-except
46
  BaseStreamer = None
47
 
48
- from .configuration_internlm import InternLMConfig as InternLM2Config
49
 
50
  logger = logging.get_logger(__name__)
51
 
@@ -1134,11 +1134,12 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1134
  return reordered_past
1135
 
1136
  def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
1137
- prompt = ""
1138
- if meta_instruction:
1139
- prompt += f"""<s><|im_start|>system\n{meta_instruction}<|im_end|>\n"""
1140
  else:
1141
- prompt += "<s>"
 
 
1142
  for record in history:
1143
  prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
1144
  prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
@@ -1214,6 +1215,7 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1214
  self.query = query
1215
  self.history = history
1216
  self.response = ""
 
1217
  self.received_inputs = False
1218
  self.queue.put((self.response, history + [(self.query, self.response)]))
1219
 
@@ -1228,11 +1230,15 @@ class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1228
  self.received_inputs = True
1229
  return
1230
 
1231
- token = self.tokenizer.decode([value[-1]], skip_special_tokens=True)
 
1232
  if token.strip() != "<|im_end|>":
1233
  self.response = self.response + token
1234
  history = self.history + [(self.query, self.response)]
1235
  self.queue.put((self.response, history))
 
 
 
1236
 
1237
  def end(self):
1238
  self.queue.put(None)
 
45
  except: # noqa # pylint: disable=bare-except
46
  BaseStreamer = None
47
 
48
+ from .configuration_internlm2 import InternLM2Config
49
 
50
  logger = logging.get_logger(__name__)
51
 
 
1134
  return reordered_past
1135
 
1136
  def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = [], meta_instruction=""):
1137
+ if tokenizer.add_bos_token:
1138
+ prompt = ""
 
1139
  else:
1140
+ prompt = tokenizer.bos_token
1141
+ if meta_instruction:
1142
+ prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
1143
  for record in history:
1144
  prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
1145
  prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
 
1215
  self.query = query
1216
  self.history = history
1217
  self.response = ""
1218
+ self.chat = []
1219
  self.received_inputs = False
1220
  self.queue.put((self.response, history + [(self.query, self.response)]))
1221
 
 
1230
  self.received_inputs = True
1231
  return
1232
 
1233
+ self.cache.extend(value.tolist())
1234
+ token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
1235
  if token.strip() != "<|im_end|>":
1236
  self.response = self.response + token
1237
  history = self.history + [(self.query, self.response)]
1238
  self.queue.put((self.response, history))
1239
+ self.cache = []
1240
+ else:
1241
+ self.end()
1242
 
1243
  def end(self):
1244
  self.queue.put(None)
tokenization_internlm.py → tokenization_internlm2.py RENAMED
@@ -1,10 +1,7 @@
1
  # coding=utf-8
2
- # Copyright (c) InternLM. All rights reserved.
3
  #
4
- # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
- # and OPT implementations in this library. It has been modified from its
6
- # original forms to accommodate minor architectural differences compared
7
- # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
  #
9
  # Licensed under the Apache License, Version 2.0 (the "License");
10
  # you may not use this file except in compliance with the License.
@@ -18,7 +15,7 @@
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
 
21
- """Tokenization classes for IntermLM."""
22
  import os
23
  from shutil import copyfile
24
  from typing import Any, Dict, List, Optional, Tuple
@@ -34,9 +31,10 @@ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
34
  PRETRAINED_VOCAB_FILES_MAP = {}
35
 
36
 
37
- class InternLMTokenizer(PreTrainedTokenizer):
 
38
  """
39
- Construct a InternLM tokenizer. Based on byte-level Byte-Pair-Encoding.
40
 
41
  Args:
42
  vocab_file (`str`):
@@ -79,8 +77,6 @@ class InternLMTokenizer(PreTrainedTokenizer):
79
  **kwargs,
80
  )
81
 
82
- """ Initialization"""
83
-
84
  @property
85
  def no_prefix_space_tokens(self):
86
  if self._no_prefix_space_tokens is None:
 
1
  # coding=utf-8
2
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
  #
4
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
 
 
 
5
  #
6
  # Licensed under the Apache License, Version 2.0 (the "License");
7
  # you may not use this file except in compliance with the License.
 
15
  # See the License for the specific language governing permissions and
16
  # limitations under the License.
17
 
18
+ """Tokenization classes for InternLM."""
19
  import os
20
  from shutil import copyfile
21
  from typing import Any, Dict, List, Optional, Tuple
 
31
  PRETRAINED_VOCAB_FILES_MAP = {}
32
 
33
 
34
+ # Modified from transformers.models.llama.tokenization_llama.LlamaTokenizer
35
+ class InternLM2Tokenizer(PreTrainedTokenizer):
36
  """
37
+ Construct a InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
38
 
39
  Args:
40
  vocab_file (`str`):
 
77
  **kwargs,
78
  )
79
 
 
 
80
  @property
81
  def no_prefix_space_tokens(self):
82
  if self._no_prefix_space_tokens is None:
tokenization_internlm2_fast.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ """Tokenization Fast class for InternLM."""
19
+ import os
20
+ from shutil import copyfile
21
+ from typing import Any, Dict, Optional, Tuple
22
+
23
+ from tokenizers import processors, decoders, Tokenizer, normalizers
24
+ from tokenizers.models import BPE
25
+
26
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
27
+ from transformers.utils import logging
28
+
29
+ from transformers.convert_slow_tokenizer import (
30
+ SLOW_TO_FAST_CONVERTERS,
31
+ SpmConverter,
32
+ SentencePieceExtractor,
33
+ )
34
+
35
+ from .tokenization_internlm2 import InternLM2Tokenizer
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+ VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
40
+
41
+ # Modified from transformers.convert_slow_tokenizer.LlamaConverter
42
class InternLM2Converter(SpmConverter):
    """Convert the slow, SentencePiece-backed InternLM2 tokenizer into a `tokenizers` BPE tokenizer.

    Modified from `transformers.convert_slow_tokenizer.LlamaConverter`. Only BPE
    SentencePiece models are accepted; any other model type is rejected in
    `tokenizer()`.
    """

    handle_byte_fallback = True

    def vocab(self, proto):
        """Return the vocabulary as a list of `(piece, score)` pairs.

        The first three entries are pinned to `<unk>`, `<s>` and `</s>` with a
        score of 0.0; every remaining entry is taken from `proto.pieces[3:]`.
        """
        special_pieces = [
            ("<unk>", 0.0),
            ("<s>", 0.0),
            ("</s>", 0.0),
        ]
        regular_pieces = [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return special_pieces + regular_pieces

    def unk_id(self, proto):
        """Return the id of the unknown token, which `vocab()` always places at index 0."""
        return 0

    def decoder(self, replacement, add_prefix_space):
        """Build the decoder pipeline; `replacement` and `add_prefix_space` are ignored.

        The pipeline maps the "▁" marker back to a space, applies byte fallback,
        fuses the pieces, and strips one leading space.
        """
        return decoders.Sequence(
            [
                decoders.Replace("▁", " "),
                decoders.ByteFallback(),
                decoders.Fuse(),
                decoders.Strip(content=" ", left=1),
            ]
        )

    def tokenizer(self, proto):
        """Build the `tokenizers.Tokenizer` (BPE with byte fallback) from the SentencePiece proto.

        Raises:
            RuntimeError: if `proto.trainer_spec.model_type` is not 2 (BPE).
        """
        model_type = proto.trainer_spec.model_type
        # Validate the model type first so that an unsupported file fails before
        # any vocabulary or merge extraction work is done.
        if model_type == 1:
            raise RuntimeError("InternLM2 is supposed to be a BPE model!")
        if model_type != 2:
            raise RuntimeError(
                "InternLM2 is supposed to be a BPE model, but the SentencePiece file was trained with an "
                f"unsupported algorithm (model_type={model_type})"
            )

        vocab_scores = self.vocab(proto)
        # Special tokens: wherever the slow tokenizer declares an added token for
        # an id, use the added token's content in place of the proto piece.
        added_tokens = self.original_tokenizer.added_tokens_decoder
        for index, (_piece, score) in enumerate(vocab_scores):
            if index in added_tokens:
                vocab_scores[index] = (added_tokens[index].content, score)

        _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
        bpe_vocab = {word: index for index, (word, _score) in enumerate(vocab_scores)}
        tokenizer = Tokenizer(
            BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
        )
        tokenizer.add_special_tokens(list(added_tokens.values()))
        return tokenizer

    def normalizer(self, proto):
        """Build the normalizer: optionally prepend "▁", then replace every space with "▁"."""
        normalizers_list = []
        if proto.normalizer_spec.add_dummy_prefix:
            normalizers_list.append(normalizers.Prepend(prepend="▁"))
        normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
        return normalizers.Sequence(normalizers_list)

    def pre_tokenizer(self, replacement, add_prefix_space):
        """Return `None`: no pre-tokenizer is used for this converter."""
        return None
105
+
106
+ SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
107
+
108
+
109
+ # Modified from transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
110
class InternLM2TokenizerFast(PreTrainedTokenizerFast):
    """Fast InternLM2 tokenizer backed by the `tokenizers` library.

    Modified from `LlamaTokenizerFast`. BOS/EOS insertion is implemented with a
    `TemplateProcessing` post-processor that is rebuilt whenever
    `add_bos_token` or `add_eos_token` is assigned.

    Args:
        vocab_file: Path to the SentencePiece model file; kept so the slow
            tokenizer vocabulary can be saved again.
        unk_token, bos_token, eos_token, pad_token: Special token strings.
        sp_model_kwargs: Forwarded unchanged to the base class.
        add_bos_token: Whether the post-processor prepends `bos_token`.
        add_eos_token: Whether the post-processor appends `eos_token`.
        decode_with_prefix_space: Forwarded unchanged to the base class.
        clean_up_tokenization_spaces: Forwarded unchanged to the base class.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = InternLM2Tokenizer
    padding_side = "left"
    model_input_names = ["input_ids", "attention_mask"]
    _auto_class = "AutoTokenizer"

    def __init__(
        self,
        vocab_file,
        unk_token="<unk>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="</s>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file=vocab_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            decode_with_prefix_space=decode_with_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        # Write the private fields directly: the public setters would each
        # rebuild the post-processor, and it only needs to be built once here.
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """Whether `vocab_file` is set and points to an existing file."""
        return bool(self.vocab_file) and os.path.isfile(self.vocab_file)

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.

        Raises:
            ValueError: if a token is requested (`add_bos_token` / `add_eos_token`) but is `None`.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError("add_bos_token = True but bos_token = None")

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError("add_eos_token = True but eos_token = None")

        # Template pieces are "<token>:<type_id>"; the first sequence uses type
        # id 0 and the second sequence of a pair uses type id 1.
        single_parts = []
        pair_tail = []
        special_tokens = []
        if self.add_bos_token:
            single_parts.append(f"{bos}:0")
            pair_tail.append(f"{bos}:1")
            special_tokens.append((bos, bos_token_id))
        single_parts.append("$A:0")
        pair_tail.append("$B:1")
        if self.add_eos_token:
            single_parts.append(f"{eos}:0")
            pair_tail.append(f"{eos}:1")
            special_tokens.append((eos, eos_token_id))

        single = " ".join(single_parts)
        pair = " ".join(single_parts + pair_tail)
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        """Whether `eos_token` is appended by the post-processor."""
        return self._add_eos_token

    @property
    def add_bos_token(self):
        """Whether `bos_token` is prepended by the post-processor."""
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value
        self.update_post_processor()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """Copy the SentencePiece vocabulary file into `save_directory`.

        Args:
            save_directory: Existing directory to copy the vocabulary file into.
            filename_prefix: Optional prefix, joined to the file name with "-".

        Returns:
            A one-element tuple with the written path, or `None` (after logging
            an error) when `save_directory` is not a directory.

        Raises:
            ValueError: if the original vocabulary file is not available.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
                "tokenizer."
            )

        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path (%s) should be a directory", save_directory)
            return None

        # VOCAB_FILES_NAMES stores the name as "./tokenizer.model". Prefixing that
        # string directly yields "<prefix>-./tokenizer.model", i.e. a path inside a
        # non-existent "<prefix>-." directory, so only the base name is used.
        vocab_name = os.path.basename(VOCAB_FILES_NAMES["vocab_file"])
        prefix = f"{filename_prefix}-" if filename_prefix else ""
        out_vocab_file = os.path.join(save_directory, prefix + vocab_name)

        # Skip the copy when source and destination are the same file.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
tokenizer_config.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "auto_map": {
3
  "AutoTokenizer": [
4
- "tokenization_internlm.InternLMTokenizer",
5
- null
6
  ]
7
  },
8
  "bos_token": "<s>",
@@ -10,7 +10,7 @@
10
  "eos_token": "</s>",
11
  "model_max_length": 1000000000000000019884624838656,
12
  "pad_token": "</s>",
13
- "tokenizer_class": "InternLMTokenizer",
14
  "unk_token": "<unk>",
15
  "added_tokens_decoder": {
16
  "0": {
 
1
  {
2
  "auto_map": {
3
  "AutoTokenizer": [
4
+ "tokenization_internlm2.InternLM2Tokenizer",
5
+ "tokenization_internlm2_fast.InternLM2TokenizerFast"
6
  ]
7
  },
8
  "bos_token": "<s>",
 
10
  "eos_token": "</s>",
11
  "model_max_length": 1000000000000000019884624838656,
12
  "pad_token": "</s>",
13
+ "tokenizer_class": "InternLM2Tokenizer",
14
  "unk_token": "<unk>",
15
  "added_tokens_decoder": {
16
  "0": {