Fix postprocessing by switching to AutoTokenizer

Browse files

Files changed (9) hide show

model_repository/postprocessing/1/gpt2-merges.txt +0 -0
model_repository/postprocessing/1/gpt2-vocab.json +0 -0
model_repository/postprocessing/1/model.py +9 -18
model_repository/postprocessing/1/utils/__init__.py +0 -13
model_repository/postprocessing/1/utils/gpt_token_encoder.py +0 -172
model_repository/preprocessing/1/model.py +4 -7
model_repository/preprocessing/1/utils/__init__.py +0 -13
model_repository/preprocessing/1/utils/gpt_token_encoder.py +0 -170
model_repository/preprocessing/1/word_list.py +0 -56

model_repository/postprocessing/1/gpt2-merges.txt DELETED Viewed

The diff for this file is too large to render. See raw diff

model_repository/postprocessing/1/gpt2-vocab.json DELETED Viewed

The diff for this file is too large to render. See raw diff

model_repository/postprocessing/1/model.py CHANGED Viewed

@@ -5,15 +5,7 @@ from typing import Any, Dict, List
 import numpy as np
 import triton_python_backend_utils as pb_utils
-import utils.gpt_token_encoder as encoder
-# GPT3 Related variables
-# Reference:
-#   https://github.com/NVIDIA/FasterTransformer/blob/main/sample/pytorch/gpt_sample.py
-MERGES_FILE = "gpt2-merges.txt"
-VOCAB_FILE = "gpt2-vocab.json"
-MAX_BATCH_SIZE = 8
 class TritonPythonModel:
@@ -24,8 +16,6 @@ class TritonPythonModel:
         Implementing `initialize` function is optional. This function allows
         the model to initialize any state associated with this model.
-        Parameters
         Args:
           Both keys and values are strings. The dictionary keys and values are:
           * model_config: A JSON string containing the model configuration
@@ -44,6 +34,13 @@ class TritonPythonModel:
         # Convert Triton types to numpy types
         self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
     def execute(
         self, requests: List["pb_utils.InferenceRequest"]
     ) -> List["pb_utils.InferenceResponse"]:
@@ -115,14 +112,8 @@ class TritonPythonModel:
     def _postprocessing(self, tokens_batch: np.ndarray) -> List[bytes]:
         """Postprocess."""
-        cur_folder = Path(__file__).parent
-        enc = encoder.get_encoder(
-            str(cur_folder / VOCAB_FILE), str(cur_folder / MERGES_FILE)
-        )
         outputs = []
         for beam_tokens in tokens_batch:
             for tokens in beam_tokens:
-                output = enc.decode(tokens)
-                outputs.append(output.encode("utf8"))
         return outputs

 import numpy as np
 import triton_python_backend_utils as pb_utils
+from transformers import AutoTokenizer
 class TritonPythonModel:
         Implementing `initialize` function is optional. This function allows
         the model to initialize any state associated with this model.
         Args:
           Both keys and values are strings. The dictionary keys and values are:
           * model_config: A JSON string containing the model configuration
         # Convert Triton types to numpy types
         self.output_dtype = pb_utils.triton_string_to_numpy(output_config["data_type"])
+        # Init a tokenizer for postprocessing.
+        cur_folder = Path(__file__).parent
+        cache_dir = cur_folder / ".cache"
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            "Salesforce/codegen-350M-mono", cache_dir=cache_dir
+        )
     def execute(
         self, requests: List["pb_utils.InferenceRequest"]
     ) -> List["pb_utils.InferenceResponse"]:
     def _postprocessing(self, tokens_batch: np.ndarray) -> List[bytes]:
         """Postprocess."""
         outputs = []
         for beam_tokens in tokens_batch:
             for tokens in beam_tokens:
+                outputs.append(self.tokenizer.decode(tokens))
         return outputs

model_repository/postprocessing/1/utils/__init__.py DELETED Viewed

@@ -1,13 +0,0 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.

model_repository/postprocessing/1/utils/gpt_token_encoder.py DELETED Viewed

@@ -1,172 +0,0 @@
-"""Byte pair encoding utilities"""
-# Modified MIT License
-# Software Copyright (c) 2019 OpenAI
-# We don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
-# We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
-# associated documentation files (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-# The above copyright notice and this permission notice need not be included
-# with content created by the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-# OR OTHER DEALINGS IN THE SOFTWARE.
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-from functools import lru_cache
-import regex as re
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-class Encoder:
-    def __init__(self, encoder, bpe_merges, errors="replace"):
-        self.encoder = encoder
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(
-            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-        )
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-        if not pairs:
-            return token
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except:
-                    new_word.extend(word[i:])
-                    break
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-    def encode(self, text):
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
-            bpe_tokens.extend(
-                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
-            )
-        return bpe_tokens
-    def decode(self, tokens):
-        text = "".join(
-            [self.decoder[min(token, 50256)] for token in tokens]
-        )
-        text = bytearray([self.byte_decoder[c] for c in text]).decode(
-            "utf-8", errors=self.errors
-        )
-        return text
-def get_encoder(vocab_file, bpe_file):
-    with open(vocab_file, "r") as f:
-        encoder = json.load(f)
-    with open(bpe_file, "r", encoding="utf-8") as f:
-        bpe_data = f.read()
-    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
-    return Encoder(
-        encoder=encoder,
-        bpe_merges=bpe_merges,
-    )

model_repository/preprocessing/1/model.py CHANGED Viewed

@@ -9,12 +9,8 @@ import torch
 import triton_python_backend_utils as pb_utils
 from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoTokenizer
-from word_list import to_word_list_format
-START_ID = 50256
 END_ID = 50256
-MAX_BATCH_SIZE = 8
 class TritonPythonModel:
@@ -102,8 +98,8 @@ class TritonPythonModel:
             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
-            bad_words = to_word_list_format(bad_words_dict)
-            stop_words = to_word_list_format(stop_words_dict)
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
@@ -165,7 +161,8 @@ class TritonPythonModel:
         return start_ids, start_lengths
-    def _create_word_list(self, word_dict: Dict[str, Any]) -> np.ndarray:
         flat_ids = []
         offsets = []
         for word_dict_item in word_dict:

 import triton_python_backend_utils as pb_utils
 from torch.nn.utils.rnn import pad_sequence
 from transformers import AutoTokenizer
 END_ID = 50256
 class TritonPythonModel:
             # Preprocessing input data.
             input_id, request_input_len = self._create_request(query)
+            bad_words = self._create_word_list(bad_words_dict)
+            stop_words = self._create_word_list(stop_words_dict)
             # Create output tensors. You need pb_utils.Tensor
             # objects to create pb_utils.InferenceResponse.
         return start_ids, start_lengths
+    def _create_word_list(self, word_dict: np.ndarray) -> np.ndarray:
+        """Encode the word list."""
         flat_ids = []
         offsets = []
         for word_dict_item in word_dict:

model_repository/preprocessing/1/utils/__init__.py DELETED Viewed

@@ -1,13 +0,0 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.

model_repository/preprocessing/1/utils/gpt_token_encoder.py DELETED Viewed

@@ -1,170 +0,0 @@
-"""Byte pair encoding utilities"""
-# Modified MIT License
-# Software Copyright (c) 2019 OpenAI
-# We don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
-# We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
-# associated documentation files (the "Software"), to deal in the Software without restriction,
-# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
-# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
-# subject to the following conditions:
-# The above copyright notice and this permission notice shall be included
-# in all copies or substantial portions of the Software.
-# The above copyright notice and this permission notice need not be included
-# with content created by the Software.
-# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
-# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
-# OR OTHER DEALINGS IN THE SOFTWARE.
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import json
-import os
-from functools import lru_cache
-import regex as re
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    bs = (
-        list(range(ord("!"), ord("~") + 1))
-        + list(range(ord("¡"), ord("¬") + 1))
-        + list(range(ord("®"), ord("ÿ") + 1))
-    )
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8 + n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-class Encoder:
-    def __init__(self, encoder, bpe_merges, errors="replace"):
-        self.encoder = encoder
-        self.decoder = {v: k for k, v in self.encoder.items()}
-        self.errors = errors  # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(
-            r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
-        )
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-        if not pairs:
-            return token
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except:
-                    new_word.extend(word[i:])
-                    break
-                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
-                    new_word.append(first + second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = " ".join(word)
-        self.cache[token] = word
-        return word
-    def encode(self, text):
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            token = "".join(self.byte_encoder[b] for b in token.encode("utf-8"))
-            bpe_tokens.extend(
-                self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ")
-            )
-        return bpe_tokens
-    def decode(self, tokens):
-        text = "".join([self.decoder[token] for token in tokens])
-        text = bytearray([self.byte_decoder[c] for c in text]).decode(
-            "utf-8", errors=self.errors
-        )
-        return text
-def get_encoder(vocab_file, bpe_file):
-    with open(vocab_file, "r") as f:
-        encoder = json.load(f)
-    with open(bpe_file, "r", encoding="utf-8") as f:
-        bpe_data = f.read()
-    bpe_merges = [tuple(merge_str.split()) for merge_str in bpe_data.split("\n")[1:-1]]
-    return Encoder(
-        encoder=encoder,
-        bpe_merges=bpe_merges,
-    )

model_repository/preprocessing/1/word_list.py DELETED Viewed

@@ -1,56 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import csv
-from pathlib import Path
-import numpy as np
-from transformers import AutoTokenizer
-def to_word_list_format(word_dict):
-    cache_dir = Path(__file__).parent / ".cache"
-    tokenizer = AutoTokenizer.from_pretrained(
-        "Salesforce/codegen-350M-mono", cache_dir=cache_dir
-    )
-    flat_ids = []
-    offsets = []
-    for word_dict_item in word_dict:
-        item_flat_ids = []
-        item_offsets = []
-        if isinstance(word_dict_item[0], bytes):
-            word_dict_item = [word_dict_item[0].decode()]
-        words = list(csv.reader(word_dict_item))[0]
-        for word in words:
-            ids = tokenizer.encode(word)
-            if len(ids) == 0:
-                continue
-            item_flat_ids += ids
-            item_offsets.append(len(ids))
-        flat_ids.append(np.array(item_flat_ids))
-        offsets.append(np.cumsum(np.array(item_offsets)))
-    pad_to = max(1, max(len(ids) for ids in flat_ids))
-    for i, (ids, offs) in enumerate(zip(flat_ids, offsets)):
-        flat_ids[i] = np.pad(ids, (0, pad_to - len(ids)), constant_values=0)
-        offsets[i] = np.pad(offs, (0, pad_to - len(offs)), constant_values=-1)
-    return np.array([flat_ids, offsets], dtype="int32").transpose((1, 0, 2))