# NOTE: removed non-code residue from a web/HTML capture of this file
# ("Spaces: Running on Zero" — HuggingFace Spaces page chrome, not source).
from os import PathLike
from typing import Dict, List, Union

from wenet.text.base_tokenizer import BaseTokenizer, T as Type
class HuggingFaceTokenizer(BaseTokenizer):
    """Lazy wrapper around a HuggingFace ``AutoTokenizer``.

    The underlying tokenizer is deliberately NOT constructed in
    ``__init__``: transformers tokenizers are not reliably picklable, so
    construction is deferred to first use and the instance only carries
    the model identifier plus the constructor arguments.
    """

    def __init__(self, model: Union[str, PathLike], *args, **kwargs) -> None:
        """Store the model id/path and deferred constructor arguments.

        Args:
            model: HuggingFace model name or local path passed to
                ``AutoTokenizer.from_pretrained``.
            *args: extra positional args forwarded to ``from_pretrained``.
            **kwargs: extra keyword args forwarded to ``from_pretrained``.
        """
        # NOTE(Mddct): don't build here, pickle issues
        self.model = model
        self.tokenizer = None
        self.args = args
        self.kwargs = kwargs

    def __getstate__(self):
        # Exclude the unpicklable tokenizer; it is rebuilt lazily after
        # unpickling by _build_hugging_face().
        state = self.__dict__.copy()
        del state['tokenizer']
        return state

    def __setstate__(self, state):
        # Restore attributes and reset the tokenizer slot so the next
        # method call triggers a rebuild.
        self.__dict__.update(state)
        recovery = {'tokenizer': None}
        self.__dict__.update(recovery)

    def _build_hugging_face(self):
        """Build the HuggingFace tokenizer on first use (idempotent)."""
        if self.tokenizer is None:
            # Import inside the guard so transformers is only required
            # (and only imported) when the tokenizer is actually built.
            from transformers import AutoTokenizer
            # BUG FIX: positional args captured in __init__ were stored
            # but never forwarded; pass them through as well.
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model, *self.args, **self.kwargs)
            self.t2i = self.tokenizer.get_vocab()

    def text2tokens(self, line: str) -> List[Type]:
        """Split a text line into tokenizer tokens."""
        self._build_hugging_face()
        return self.tokenizer.tokenize(line)

    def tokens2text(self, tokens: List[Type]) -> str:
        """Join tokens back into text via id decoding."""
        self._build_hugging_face()
        ids = self.tokens2ids(tokens)
        return self.tokenizer.decode(ids)

    def tokens2ids(self, tokens: List[Type]) -> List[int]:
        """Map tokens to their integer ids."""
        self._build_hugging_face()
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def ids2tokens(self, ids: List[int]) -> List[Type]:
        """Map integer ids back to tokens."""
        self._build_hugging_face()
        return self.tokenizer.convert_ids_to_tokens(ids)

    def vocab_size(self) -> int:
        """Return the tokenizer length (base vocab + added tokens)."""
        self._build_hugging_face()
        # TODO: we need special tokenize size in future
        return len(self.tokenizer)

    def symbol_table(self) -> Dict[Type, int]:
        """Return the token -> id mapping from the underlying vocab."""
        self._build_hugging_face()
        return self.t2i