from os import PathLike
from typing import Dict, List, Union
from wenet.text.base_tokenizer import BaseTokenizer, T as Type
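

# HuggingFaceTokenizer wraps Hugging Face "transformers" tokenizers behind
# wenet's BaseTokenizer interface. The AutoTokenizer is built lazily on
# first use (see _build_hugging_face) so instances stay picklable, e.g.
# when handed to multiprocessing dataloader workers.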
class HuggingFaceTokenizer(BaseTokenizer):

    def __init__(self, model: Union[str, PathLike], *args, **kwargs) -> None:
        # NOTE(Mddct): don't build here, pickle issues
        self.model = model
        self.tokenizer = None
        self.args = args
        self.kwargs = kwargs

    def __getstate__(self):
        # Drop the unpicklable transformers tokenizer before serialization;
        # it is rebuilt lazily after unpickling.
        state = self.__dict__.copy()
        del state['tokenizer']
        return state

    def __setstate__(self, state):
        # Restore everything else and reset the tokenizer to None so the
        # next call rebuilds it.
        self.__dict__.update(state)
        recovery = {'tokenizer': None}
        self.__dict__.update(recovery)

    def _build_hugging_face(self):
        # Import lazily so transformers is only required when the
        # tokenizer is actually used.
        from transformers import AutoTokenizer
        if self.tokenizer is None:
            # Forward the positional args captured in __init__ as well;
            # previously they were stored but never passed along.
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model, *self.args, **self.kwargs)
            self.t2i = self.tokenizer.get_vocab()

    def text2tokens(self, line: str) -> List[Type]:
        self._build_hugging_face()
        return self.tokenizer.tokenize(line)

    def tokens2text(self, tokens: List[Type]) -> str:
        self._build_hugging_face()
        ids = self.tokens2ids(tokens)
        return self.tokenizer.decode(ids)

    def tokens2ids(self, tokens: List[Type]) -> List[int]:
        self._build_hugging_face()
        return self.tokenizer.convert_tokens_to_ids(tokens)

    def ids2tokens(self, ids: List[int]) -> List[Type]:
        self._build_hugging_face()
        return self.tokenizer.convert_ids_to_tokens(ids)

    def vocab_size(self) -> int:
        self._build_hugging_face()
        # TODO: account for the size of special tokens in the future
        return len(self.tokenizer)

    @property
    def symbol_table(self) -> Dict[Type, int]:
        self._build_hugging_face()
        return self.t2i
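

# A minimal usage sketch, not part of the original module: 'bert-base-uncased'
# is an assumed example checkpoint; any Hugging Face tokenizer name or local
# path should work. It exercises the lazy build and the pickle round-trip.
if __name__ == '__main__':
    import pickle

    tokenizer = HuggingFaceTokenizer('bert-base-uncased')  # assumed model name

    # Pickling works even before the tokenizer is built, because
    # __getstate__ drops the 'tokenizer' entry (still None here).
    tokenizer = pickle.loads(pickle.dumps(tokenizer))

    # First call triggers AutoTokenizer.from_pretrained under the hood.
    tokens = tokenizer.text2tokens('hello world')  # e.g. ['hello', 'world']
    ids = tokenizer.tokens2ids(tokens)
    print(tokens, ids)
    print(tokenizer.tokens2text(tokens))  # 'hello world'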