mashirong committed
Commit e346937 · Parent: 5a3cf15

Remove unused file
Browse files: tokenization_deepseek.py (+0 -328)

tokenization_deepseek.py (DELETED)
@@ -1,328 +0,0 @@
```python
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Forked from the file src/transformers/models/bert_generation/tokenization_bert_generation.py from the HuggingFace Transformers library.
Permalink: https://github.com/huggingface/transformers/blob/04ab5605fbb4ef207b10bf2772d88c53fc242e83/src/transformers/models/bert_generation/tokenization_bert_generation.py
Tokenizer class for ReplitLM
Class is modified for compatibility with custom vocabulary and to achieve desired encode/decode behavior for Replit Code V1 3B model.
"""
import os
import sentencepiece as spm
from sentencepiece import SentencePieceProcessor
from shutil import copyfile
from transformers import PreTrainedTokenizer
from typing import Any, Dict, List, Optional, Tuple
import base64

VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}

class Tokenizer:
    def __init__(self, model_path="/weka-jd/prod/deepseek/permanent/shared/mingchuan/llama_data/tokenizer.model"):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # # ? print spm for debugging
        # spm_proto = sp_pb2_model.ModelProto()
        # spm_proto.ParseFromString(self.sp_model.serialized_model_proto())
        # print(dir(spm_proto))
        # attrs = ['denormalizer_spec', 'normalizer_spec', 'trainer_spec']
        # print('=======' * 5)
        # for attr in attrs:
        #     print('=======', attr, '=======')
        #     print(getattr(spm_proto, attr))

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.pad_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        return self.sp_model.decode(t)

class LineBBPETokenizer(Tokenizer):
    def __init__(self,
                 model_path="/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/spm_0717_final/100000/bbpe_full_bytes.model",
                 ignore_decode_err=False, attachfile_path=None):
        super().__init__(model_path=model_path)
        self.ignore_decode_err = ignore_decode_err
        Bvocab_path = attachfile_path + "/byteVocab.txt"
        #'/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/byteVocab.txt'
        punct_path = attachfile_path + "/all_punct.txt"
        #punct_path = '/3fs-jd/prod/deepseek/shared/daidamai/data/bbpe/all_punct.txt'
        Bvocab = open(Bvocab_path, 'r', encoding = 'utf-8')
        self.punct = []
        with open(punct_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                if line:
                    self.punct.append(line)

        self.numchars = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.white_space = [' ']
        self.special_chars = set(self.numchars) | set(self.punct) | set(self.white_space)

        # ! remove chars that will be encoded to 0 (unk_id)
        unk_ch = set()
        for ch in self.special_chars:
            ids = self.sp_model.encode(ch)
            if 0 in ids:
                unk_ch.update(ch)
        self.special_chars = self.special_chars - unk_ch

        self.byte2ch = [-1] * 256
        self.ch2byte = {}
        for line in list(Bvocab.readlines())[:256]:
            tokens = line.strip().split('\t')
            self.byte2ch[int(tokens[0])] = tokens[1]
            self.ch2byte[tokens[1]] = int(tokens[0])
        self.b16_dec = {}
        self.b16_enc = ['x'] * 16
        for i in range(10):
            self.b16_dec[str(i)] = i
            self.b16_enc[i] = str(i)
        self.b16_dec['A'] = 10
        self.b16_dec['B'] = 11
        self.b16_dec['C'] = 12
        self.b16_dec['D'] = 13
        self.b16_dec['E'] = 14
        self.b16_dec['F'] = 15
        self.b16_enc[10] = 'A'
        self.b16_enc[11] = 'B'
        self.b16_enc[12] = 'C'
        self.b16_enc[13] = 'D'
        self.b16_enc[14] = 'E'
        self.b16_enc[15] = 'F'

        self.new_line_id = self.sp_model.encode(self.mapping_raw_to_256ch('\n'))[-1]

    def base16encode(self, n):
        return self.b16_enc[n // 16] + self.b16_enc[n % 16]

    def base16decode(self, s):
        return self.b16_dec[s[0]] * 16 + self.b16_dec[s[1]]

    def mapping_raw_to_256ch(self, s: str) -> str:
        mapped_s = []
        for token in s:
            if token in self.special_chars:
                mapped_s.append(token)
                continue
            tk = str(base64.b16encode(token.encode("utf-8")))[2:-1]
            num = len(tk) // 2
            for i in range(num):
                mapped_s.append(self.byte2ch[(self.base16decode(tk[2*i:2*i+2]))])
        return ''.join(mapped_s)

    def mapping_256ch_to_raw(self, s: str) -> str:
        mapped_s = ''
        for token in s:
            if token in self.ch2byte:
                mapped_s += self.base16encode(self.ch2byte[token])
            else:
                mapped_s += str(base64.b16encode(token.encode("utf-8")))[2:-1]
        # decode utf-8 string to text string
        byte_s = bytes.fromhex(mapped_s)
        if self.ignore_decode_err:
            try:
                mapped_s = byte_s.decode('utf-8')
            except UnicodeDecodeError:
                mapped_s = ''
        else:
            mapped_s = byte_s.decode('utf-8')
        return mapped_s

    def encode_line(self, s):
        if s == '\n':
            return [self.new_line_id]
        ss = self.mapping_raw_to_256ch(s)
        t = self.sp_model.encode(ss)
        return t

    def encode(self, s: str, bos: bool, eos: bool) -> List[int]:
        assert type(s) is str
        t = []
        lines = s.split('\n')
        n_lines = len(lines)
        for i in range(n_lines):
            if i != n_lines - 1:
                line = lines[i] + '\n'
            else:
                line = lines[i]
            tt = self.encode_line(line)
            t += tt
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def get_restored_white_space(self, t):
        t = t[:3]
        if t[0] == self.bos_id:
            t = t[1:]
        decoded = self.sp_model.decode(t)
        encoded = self.sp_model.encode(decoded)
        if len(encoded) < len(t):
            return ' '
        else:
            return ''

    def decode_line(self, t):
        if len(t) == 1 and t[0] == self.new_line_id:
            return '\n'
        # ? special bug fixing for a single whitespace in the line beginning, sentencepiece will consume it, we restore it
        restored_white_space = self.get_restored_white_space(t)
        ss = self.sp_model.decode(t)
        s = restored_white_space + self.mapping_256ch_to_raw(ss)
        return s

    def decode(self, t: List[int]) -> str:
        s = ''
        new_line_indices = [index for index, value in enumerate(t) if value == self.new_line_id]
        last_idx = 0
        for i in range(len(new_line_indices)):
            line_id = t[last_idx:new_line_indices[i] + 1]
            ss = self.decode_line(line_id)
            s += ss
            last_idx = new_line_indices[i] + 1
        if last_idx < len(t):
            line_id = t[last_idx:]
            ss = self.decode_line(line_id)
            s += ss
        return s

    def add_special(self, special_tokens):
        '''
        add special tokens to the tokenizer
        '''
        spm_proto = sp_pb2_model.ModelProto()
        spm_proto.ParseFromString(self.sp_model.serialized_model_proto())
        for special_token in special_tokens:
            new_p = sp_pb2_model.ModelProto().SentencePiece()
            new_p.piece = self.mapping_raw_to_256ch(special_token)
            new_p.score = 0.0
            new_p.type = 4
            spm_proto.pieces.append(new_p)
            print(f'special token added: {special_token}')
        self.sp_model.LoadFromSerializedProto(spm_proto.SerializeToString())

class DeepSeekTokenizer(PreTrainedTokenizer):
    """
    Construct a ReplitLMTokenizer tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods.
    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
            The end of sequence token.
        bos_token (`str`, *optional*, defaults to `None`):
            The begin of sequence token.
        unk_token (`str`, *optional*, defaults to `"<|unk|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<|pad|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:
            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.
            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.
    """
    vocab_files_names = VOCAB_FILES_NAMES
    prefix_tokens: List[int] = []
    model_input_names = ['input_ids', 'attention_mask']

    def __init__(self, vocab_file, bos_token="<s>", eos_token='</s>', unk_token=None, pad_token=None, sep_token='</s>', sp_model_kwargs: Optional[Dict[str, Any]]=None, name_or_path=None, **kwargs) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, sp_model_kwargs=self.sp_model_kwargs, **kwargs)
        #obtain the current directory of py
        vocab_path = name_or_path
        print("vocab_path: ", vocab_path)
        self.vocab_path = vocab_path
        self.vocab_file = vocab_path + '/tokenizer.model'
        self.token = LineBBPETokenizer(model_path=self.vocab_file, attachfile_path=vocab_path, ignore_decode_err=True)

    @property
    def vocab_size(self):
        return self.token.sp_model.get_piece_size()

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        state['token'] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        if not hasattr(self, 'sp_model_kwargs'):
            self.sp_model_kwargs = {}
        self.token = LineBBPETokenizer(model_path=self.vocab_file, attachfile_path=self.vocab_path)

    def _tokenize(self, text: str) -> List[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        token_ids = self.token.encode(text, bos=True, eos=False)
        string_tokens = [self._convert_id_to_token(token_id) for token_id in token_ids]
        return string_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.token.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.token.sp_model.id_to_piece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        ids = [self._convert_token_to_id(token) for token in tokens]
        return self.token.decode(ids)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str]=None) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            raise ValueError(f'Vocabulary path ({save_directory}) should be a directory')
        out_vocab_file = os.path.join(save_directory, (filename_prefix + '-' if filename_prefix else '') + VOCAB_FILES_NAMES['vocab_file'])
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, 'wb') as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)
        return (out_vocab_file,)
```
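For reference, the core idea in the removed `LineBBPETokenizer` is byte-level tokenization on top of SentencePiece: every character that is not a digit, punctuation mark, or space is expanded into its UTF-8 bytes, and each byte is remapped to one of 256 printable characters taken from `byteVocab.txt`; decoding inverts the mapping and reassembles the bytes. The sketch below reproduces that roundtrip with a hypothetical stand-in byte table (the real `byteVocab.txt` is not in this repository) and omits the special-character pass-through and the intermediate base-16 string used by the removed code.

```python
# Minimal sketch of the byte <-> printable-character mapping behind the removed
# mapping_raw_to_256ch / mapping_256ch_to_raw methods. The byte table here is a
# hypothetical stand-in (Unicode Private Use Area characters), not byteVocab.txt.
BYTE2CH = [chr(0xE000 + i) for i in range(256)]
CH2BYTE = {ch: i for i, ch in enumerate(BYTE2CH)}

def raw_to_256ch(s: str) -> str:
    """Replace every character by the mapped forms of its UTF-8 bytes."""
    return ''.join(BYTE2CH[b] for ch in s for b in ch.encode('utf-8'))

def ch256_to_raw(s: str) -> str:
    """Invert the mapping: recover the byte values and decode them as UTF-8."""
    return bytes(CH2BYTE[ch] for ch in s).decode('utf-8')

text = "def f(x):\n    return x  # 深度求索\n"
assert ch256_to_raw(raw_to_256ch(text)) == text
```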
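The removed `encode` also works line by line: the input is split on `'\n'`, each line is encoded together with its trailing newline, and BOS/EOS ids are added around the concatenated result (decoding later splits the id sequence again at the newline token so the per-line whitespace fix can be applied). A model-free sketch of that control flow, with `encode_line`, `bos_id`, and `eos_id` supplied by the caller as stand-ins for the removed tokenizer's own:

```python
from typing import Callable, List

def encode_by_lines(text: str, encode_line: Callable[[str], List[int]],
                    bos_id: int, eos_id: int, bos: bool = True, eos: bool = False) -> List[int]:
    # Split on newlines; every line except the last keeps its trailing '\n',
    # mirroring the loop in the removed LineBBPETokenizer.encode.
    ids: List[int] = []
    lines = text.split('\n')
    for i, line in enumerate(lines):
        if i != len(lines) - 1:
            line = line + '\n'
        ids += encode_line(line)
    if bos:
        ids = [bos_id] + ids
    if eos:
        ids = ids + [eos_id]
    return ids

# Toy per-line encoder: map each character to its code point.
print(encode_by_lines("a\nb", lambda line: [ord(c) for c in line], bos_id=1, eos_id=2))
# -> [1, 97, 10, 98]
```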
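Finally, the removed `add_special` method grows the vocabulary by editing the serialized SentencePiece model proto; note that it references `sp_pb2_model`, which the deleted file mentioned only in a commented-out debug block and never imported, so the method could not have run as written. Below is a self-contained sketch of that technique using the protobuf module bundled with the `sentencepiece` package; the model path in the usage comment is a placeholder.

```python
from typing import List

import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

def add_user_defined_tokens(sp: spm.SentencePieceProcessor, tokens: List[str]) -> None:
    """Append USER_DEFINED pieces to an already-loaded SentencePieceProcessor."""
    proto = sp_pb2_model.ModelProto()
    proto.ParseFromString(sp.serialized_model_proto())
    existing = {p.piece for p in proto.pieces}
    for tok in tokens:
        if tok in existing:
            continue
        piece = sp_pb2_model.ModelProto.SentencePiece()
        piece.piece = tok
        piece.score = 0.0
        piece.type = 4  # USER_DEFINED in sentencepiece_model.proto
        proto.pieces.append(piece)
    # Reload the processor from the modified proto.
    sp.LoadFromSerializedProto(proto.SerializeToString())

# Usage (placeholder model path; requires a real SentencePiece model file):
# sp = spm.SentencePieceProcessor(model_file="tokenizer.model")
# add_user_defined_tokens(sp, ["<|fim_begin|>", "<|fim_end|>"])
```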