Curt-Park
committed on
Commit
·
0606cc6
1
Parent(s):
d58794f
Clip the exceeded value
Browse files
model_repository/postprocessing/1/utils/gpt_token_encoder.py
CHANGED
@@ -152,7 +152,7 @@ class Encoder:
|
|
152 |
|
153 |
def decode(self, tokens):
|
154 |
text = "".join(
|
155 |
-
[self.decoder[token] for token in tokens]
|
156 |
)
|
157 |
text = bytearray([self.byte_decoder[c] for c in text]).decode(
|
158 |
"utf-8", errors=self.errors
|
|
|
152 |
|
153 |
def decode(self, tokens):
|
154 |
text = "".join(
|
155 |
+
[self.decoder[min(token, 50256)] for token in tokens]
|
156 |
)
|
157 |
text = bytearray([self.byte_decoder[c] for c in text]).decode(
|
158 |
"utf-8", errors=self.errors
|
model_repository/preprocessing/1/model.py
CHANGED
@@ -11,11 +11,6 @@ from torch.nn.utils.rnn import pad_sequence
|
|
11 |
from transformers import AutoTokenizer
|
12 |
from word_list import to_word_list_format
|
13 |
|
14 |
-
# GPT3 Related variables
|
15 |
-
# Reference:
|
16 |
-
# https://github.com/NVIDIA/FasterTransformer/blob/main/sample/pytorch/gpt_sample.py
|
17 |
-
MERGES_FILE = "gpt2-merges.txt"
|
18 |
-
VOCAB_FILE = "gpt2-vocab.json"
|
19 |
|
20 |
START_ID = 50256
|
21 |
END_ID = 50256
|
|
|
11 |
from transformers import AutoTokenizer
|
12 |
from word_list import to_word_list_format
|
13 |
|
|
|
|
|
|
|
|
|
|
|
14 |
|
15 |
START_ID = 50256
|
16 |
END_ID = 50256
|