# wenet/text/hugging_face_tokenizer.py
from os import PathLike
from typing import Dict, List, Union

from wenet.text.base_tokenizer import BaseTokenizer, T as Type

class HuggingFaceTokenizer(BaseTokenizer):
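    """Tokenizer backed by a Hugging Face ``transformers`` AutoTokenizer.

    The underlying tokenizer is built lazily on first use rather than in
    __init__; together with __getstate__/__setstate__ this keeps instances
    picklable (the transformers object itself is never pickled).
    """
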
    def __init__(self, model: Union[str, PathLike], *args, **kwargs) -> None:
        # NOTE(Mddct): don't build the tokenizer here; eager construction
        # causes pickle issues, so it is deferred to _build_hugging_face().
        self.model = model
        self.tokenizer = None
        # Extra positional and keyword arguments are forwarded to
        # AutoTokenizer.from_pretrained() when the tokenizer is built.
        self.args = args
        self.kwargs = kwargs

    def __getstate__(self):
        # Drop the (possibly unpicklable) transformers tokenizer from the
        # pickled state; everything else (model path, args, kwargs) survives.
        state = self.__dict__.copy()
        del state['tokenizer']
        return state

    def __setstate__(self, state):
        # Restore the saved state and reset the tokenizer to None so it is
        # rebuilt lazily on first use after unpickling.
        self.__dict__.update(state)
        recovery = {'tokenizer': None}
        self.__dict__.update(recovery)

    def _build_hugging_face(self):
        # Import lazily so transformers is only required when the tokenizer
        # is actually used.
        from transformers import AutoTokenizer
        if self.tokenizer is None:
            # Forward the stored args as well (they were previously captured
            # in __init__ but never used).
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model, *self.args, **self.kwargs)
            self.t2i = self.tokenizer.get_vocab()

    def text2tokens(self, line: str) -> List[Type]:
        self._build_hugging_face()
        return self.tokenizer.tokenize(line)

    def tokens2text(self, tokens: List[Type]) -> str:
        self._build_hugging_face()
        ids = self.tokens2ids(tokens)
        return self.tokenizer.decode(ids)

    def tokens2ids(self, tokens: List[Type]) -> List[int]:
        self._build_hugging_face()
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def ids2tokens(self, ids: List[int]) -> List[Type]:
        self._build_hugging_face()
        return self.tokenizer.convert_ids_to_tokens(ids)

    def vocab_size(self) -> int:
        self._build_hugging_face()
        # TODO: account for special/added tokens separately in the future.
        return len(self.tokenizer)

@property
def symbol_table(self) -> Dict[Type, int]:
self._build_hugging_face()
return self.t2i
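

# A minimal usage sketch (not part of the original module): the checkpoint
# name "bert-base-uncased" is purely illustrative. It demonstrates that the
# lazy build survives a pickle round-trip, which is the point of the
# __getstate__/__setstate__ pair above.
if __name__ == "__main__":
    import pickle

    tokenizer = HuggingFaceTokenizer("bert-base-uncased")
    # Pickling works even though the transformers tokenizer is not built yet.
    clone = pickle.loads(pickle.dumps(tokenizer))
    tokens = clone.text2tokens("hello world")  # triggers the lazy build
    ids = clone.tokens2ids(tokens)
    print(tokens, ids, clone.tokens2text(tokens))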