Curt-Park committed on
Commit
0606cc6
·
1 Parent(s): d58794f

Clip the exceeded value

Browse files
model_repository/postprocessing/1/utils/gpt_token_encoder.py CHANGED
@@ -152,7 +152,7 @@ class Encoder:
152
 
153
  def decode(self, tokens):
154
  text = "".join(
155
- [self.decoder[token] for token in tokens]
156
  )
157
  text = bytearray([self.byte_decoder[c] for c in text]).decode(
158
  "utf-8", errors=self.errors
 
152
 
153
  def decode(self, tokens):
154
  text = "".join(
155
+ [self.decoder[min(token, 50256)] for token in tokens]
156
  )
157
  text = bytearray([self.byte_decoder[c] for c in text]).decode(
158
  "utf-8", errors=self.errors
model_repository/preprocessing/1/model.py CHANGED
@@ -11,11 +11,6 @@ from torch.nn.utils.rnn import pad_sequence
11
  from transformers import AutoTokenizer
12
  from word_list import to_word_list_format
13
 
14
- # GPT3 Related variables
15
- # Reference:
16
- # https://github.com/NVIDIA/FasterTransformer/blob/main/sample/pytorch/gpt_sample.py
17
- MERGES_FILE = "gpt2-merges.txt"
18
- VOCAB_FILE = "gpt2-vocab.json"
19
 
20
  START_ID = 50256
21
  END_ID = 50256
 
11
  from transformers import AutoTokenizer
12
  from word_list import to_word_list_format
13
 
 
 
 
 
 
14
 
15
  START_ID = 50256
16
  END_ID = 50256