# coding=utf-8
# Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for Phi-4."""

import base64
import os
import re
from functools import cached_property
from typing import Collection, Dict, List, Optional, Set, Tuple, Union

import requests
import tiktoken

from transformers import AddedToken, AutoConfig, PreTrainedTokenizer
from transformers.models.auto.tokenization_auto import get_tokenizer_config


PADDED_VOCAB_SIZE = 100352
VOCAB_SIZE = 100276
VOCAB_FILES_NAMES = {"vocab_file": "cl100k_base.tiktoken"}

# Padding tokens that fill the gap between the base vocabulary and the padded vocabulary size.
DUMMY_TOKENS = {
    f"<|dummy_{12 + offset}|>": VOCAB_SIZE + offset for offset in range(1, PADDED_VOCAB_SIZE - VOCAB_SIZE)
}

SPECIAL_TOKENS = {
    "<|dummy_0|>": 100256,
    "<|endoftext|>": 100257,
    "<|fim_prefix|>": 100258,
    "<|fim_middle|>": 100259,
    "<|fim_suffix|>": 100260,
    "<|dummy_1|>": 100261,
    "<|dummy_2|>": 100262,
    "<|dummy_3|>": 100263,
    "<|im_start|>": 100264,
    "<|im_end|>": 100265,
    "<|im_sep|>": 100266,
    "<|dummy_4|>": 100267,
    "<|dummy_5|>": 100268,
    "<|dummy_6|>": 100269,
    "<|dummy_7|>": 100270,
    "<|dummy_8|>": 100271,
    "<|dummy_9|>": 100272,
    "<|dummy_10|>": 100273,
    "<|dummy_11|>": 100274,
    "<|dummy_12|>": 100275,
    "<|endofprompt|>": 100276,
    **DUMMY_TOKENS,
}


def _load_tiktoken_bpe(tiktoken_bpe_file: str) -> Dict[bytes, int]:
    # Each non-empty line of the file holds a base64-encoded token and its rank, separated by a space.
    with open(tiktoken_bpe_file, "rb") as f:
        contents = f.read()
    return {
        base64.b64decode(token): int(rank)
        for token, rank in (line.split() for line in contents.splitlines() if line)
    }


class Phi4Tokenizer(PreTrainedTokenizer):
    """
    Construct a Phi-4 tokenizer based on tiktoken.

    Args:
        vocab_file (`str`, *optional*, defaults to `None`):
            Path to the vocabulary file.
        errors (`str`, *optional*, defaults to `'replace'`):
            How to handle decoding errors. Can be `'replace'`, `'ignore'` or `'raise'`.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names: List[str] = ["input_ids", "attention_mask"]
    padding_side = "left"

    def __init__(self, vocab_file: Optional[str] = None, errors: str = "replace", **kwargs) -> None:
        # `PreTrainedTokenizer.__init__()` calls `_add_tokens()`, which checks whether each token is
        # present in `self.special_tokens`. Thus, we set the attribute beforehand to ensure that the
        # special tokens are available in `self.special_tokens`.
        self.special_tokens = SPECIAL_TOKENS
        self.errors = errors

        super().__init__(**kwargs)

        try:
            base = tiktoken.get_encoding("cl100k_base")
        except requests.RequestException:
            # Offline fallback: reuse a `cl100k_base.tiktoken` file already cached by `transformers`
            # and place it where tiktoken expects its cache (keyed by the SHA-1 of the download URL).
            import hashlib

            from transformers.utils import cached_file

            cached_tokenizer_path = cached_file(
                "microsoft/phi-4",
                "cl100k_base.tiktoken",
                _raise_exceptions_for_gated_repo=False,
                _raise_exceptions_for_missing_entries=False,
                _raise_exceptions_for_connection_errors=False,
            )
            tiktoken_cache_dir = os.path.dirname(cached_tokenizer_path)
            tiktoken_cache_path = os.path.join(
                tiktoken_cache_dir,
                hashlib.sha1(
                    "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken".encode()
                ).hexdigest(),
            )

            if not os.path.exists(tiktoken_cache_path):
                os.rename(cached_tokenizer_path, tiktoken_cache_path)

            os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir
            base = tiktoken.get_encoding("cl100k_base")

        if vocab_file is None:
            self.mergeable_ranks = base._mergeable_ranks
        else:
            self.mergeable_ranks = _load_tiktoken_bpe(vocab_file)

        self.pat_str = base._pat_str

        self.tokenizer = tiktoken.Encoding(
            name="phi4",
            pat_str=self.pat_str,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )

        self.decoder: Dict[int, bytes] = {v: k for k, v in self.mergeable_ranks.items()}
        self.decoder.update({v: k for k, v in self.special_tokens.items()})

        self.eod_id = self.tokenizer.eot_token
        self._eos_token = self._convert_id_to_token(self.eod_id)
        self._bos_token = self._eos_token

    def __getstate__(self) -> Dict[str, Union[str, bytes, int]]:
        # `tiktoken.Encoding` is not picklable, so drop it here and rebuild it in `__setstate__()`.
        state = self.__dict__.copy()
        del state["tokenizer"]
        return state

    def __setstate__(self, state: Dict[str, Union[str, bytes, int]]) -> None:
        self.__dict__ = state
        self.tokenizer = tiktoken.Encoding(
            name="phi4",
            pat_str=self.pat_str,
            mergeable_ranks=self.mergeable_ranks,
            special_tokens=self.special_tokens,
        )

    def __len__(self) -> int:
        return self.tokenizer.n_vocab

    @cached_property
    def dummy_token_indices(self) -> List[int]:
        # Some additional tokens which are not used are considered as dummy tokens
        additional_tokens = ["<|fim_prefix|>", "<|fim_middle|>", "<|fim_suffix|>", "<|endofprompt|>"]
        dummy_token_indices = [index for token, index in self.special_tokens.items() if "dummy" in token]
        dummy_token_indices.extend([self.special_tokens[token] for token in additional_tokens])
        return sorted(dummy_token_indices)

    @property
    def vocab_size(self) -> int:
        return self.tokenizer.n_vocab

    @property
    def eos_token_id(self) -> int:
        return self.eod_id

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: Union[str, os.PathLike],
        *args,
        **kwargs,
    ) -> "Phi4Tokenizer":
        cls_kwargs = kwargs
        tokenization_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs)
        if tokenization_config:
            cls_kwargs = {**tokenization_config, **cls_kwargs}
        else:
            config = AutoConfig.from_pretrained(pretrained_model_name_or_path, trust_remote_code=True)
            cls_kwargs["model_max_length"] = config.max_position_embeddings
        return cls(**cls_kwargs)

    def _add_tokens(
        self,
        new_tokens: Union[List[str], List[AddedToken]],
        special_tokens: bool = False,
    ) -> int:
        if not special_tokens and new_tokens:
            raise ValueError("Only special tokens can be added to this tokenizer")
        for token in new_tokens:
            surface_form = token.content if isinstance(token, AddedToken) else token
            if surface_form not in self.special_tokens:
                raise ValueError(
                    "For now, we do not support unknown special tokens\n"
                    "In the future, if there is a need for this, we can add special tokens to the tokenizer\n"
                    "starting from rank 100261 - 100263 and then 100266 - 100275.\n"
                    "And finally, we can re-construct the enc object back\n"
                )
        return 0

    def _strip_special_tokens(self, text: str) -> str:
        # Remove spaces and newlines immediately surrounding special tokens so they tokenize cleanly.
        for special_token in self.special_tokens:
            pattern = rf"[ \r\n]*{re.escape(special_token)}[ \r\n]*"
            text = re.sub(pattern, special_token, text)
        return text

    def _convert_id_to_token(self, index: int) -> Union[bytes, str]:
        if index in self.decoder:
            return self.decoder[index]
        return "<|dummy_0|>"

    def _convert_token_to_id(self, token: Union[bytes, str]) -> int:
        if token in self.special_tokens:
            return self.special_tokens[token]
        if token in self.mergeable_ranks:
            return self.mergeable_ranks[token]
        return 100256

    def _decode(
        self,
        token_ids: Union[int, List[int]],
        skip_special_tokens: bool = False,
        errors: Optional[str] = None,
        **kwargs,
    ) -> str:
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            token_ids = [i for i in token_ids if i < self.eod_id]
        return self.tokenizer.decode(token_ids, errors=errors or self.errors)

    def _tokenize(self, text: str, **kwargs):
        raise NotImplementedError

    def convert_tokens_to_ids(self, tokens: Union[bytes, str, List[Union[bytes, str]]]) -> Union[int, List[int]]:
        if isinstance(tokens, (str, bytes)):
            if tokens in self.special_tokens:
                return self.special_tokens[tokens]
            return self.mergeable_ranks.get(tokens)
        ids = []
        for token in tokens:
            ids.append(self.convert_tokens_to_ids(token))
        return ids

    def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
        text = ""
        temp = b""
        for t in tokens:
            if isinstance(t, str):
                if temp:
                    text += temp.decode("utf-8", errors=self.errors)
                    temp = b""
                text += t
            elif isinstance(t, bytes):
                temp += t
            else:
                raise TypeError("token should only be of type bytes or str")
        if temp:
            text += temp.decode("utf-8", errors=self.errors)
        return text

    def get_vocab(self) -> Dict[Union[str, bytes], int]:
        return {**self.mergeable_ranks, **self.special_tokens}

    def save_vocabulary(self, save_directory: str, **kwargs) -> Tuple[str]:
        file_path = os.path.join(save_directory, "cl100k_base.tiktoken")
        with open(file_path, "w") as f:
            for token, rank in self.mergeable_ranks.items():
                line = base64.b64encode(token).decode("utf-8") + " " + str(rank) + "\n"
                f.write(line)
        return (file_path,)

    def tokenize(
        self,
        text: str,
        allowed_special: Union[Set, str] = "all",
        disallowed_special: Union[Collection, str] = (),
        **kwargs,
    ) -> List[Union[bytes, str]]:
        text = self._strip_special_tokens(text)
        return [
            self.decoder[token_id]
            for token_id in self.tokenizer.encode(
                text, allowed_special=allowed_special, disallowed_special=disallowed_special
            )
        ]
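

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustration only, not part of the tokenizer itself).
# It assumes `tiktoken` can download or find a cached copy of the `cl100k_base`
# encoding; the chat-style markup below simply shows special tokens mixed with
# plain text and is not a prescribed prompt format.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tokenizer = Phi4Tokenizer()

    text = "<|im_start|>user<|im_sep|>Hello, Phi-4!<|im_end|>"
    tokens = tokenizer.tokenize(text)
    token_ids = tokenizer.convert_tokens_to_ids(tokens)

    print(tokens)      # mix of `bytes` (BPE pieces) and `str` (special tokens)
    print(token_ids)   # integer ids; special tokens resolve to ranks >= 100256
    print(tokenizer.decode(token_ids))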