# OSUM / wenet/text/bpe_tokenizer.py
# Author: tomxxie
# Commit 568e264: "适配zeroGPU" (adapt for zeroGPU)
from os import PathLike
from typing import Dict, List, Optional, Union
from wenet.text.char_tokenizer import CharTokenizer
from wenet.text.tokenize_utils import tokenize_by_bpe_model
class BpeTokenizer(CharTokenizer):
    """Tokenizer that segments text with a SentencePiece BPE model.

    Extends ``CharTokenizer``: spans matching the non-linguistic-symbol
    pattern are kept as single tokens; every other span is split by the
    BPE model.
    """

    def __init__(
        self,
        bpe_model: Union[PathLike, str],
        symbol_table: Union[str, PathLike, Dict],
        non_lang_syms: Optional[Union[str, PathLike, List]] = None,
        split_with_space: bool = False,
        connect_symbol: str = '',
        unk='<unk>',
    ) -> None:
        """Store the BPE model path; the processor itself is built lazily.

        Args:
            bpe_model: path to a trained SentencePiece model file.
            symbol_table: token→id mapping (path or dict), handled by the
                ``CharTokenizer`` base class.
            non_lang_syms: optional non-linguistic symbols (path or list).
            split_with_space: forwarded to the base class.
            connect_symbol: forwarded to the base class.
            unk: unknown-token symbol.
        """
        super().__init__(symbol_table, non_lang_syms, split_with_space,
                         connect_symbol, unk)
        self._model = bpe_model
        # NOTE(Mddct): multiprocessing.Process() issues —
        # don't build the SentencePiece processor here.
        self.bpe_model = None

    def _build_sp(self):
        """Lazily construct and load the SentencePiece processor once."""
        if self.bpe_model is not None:
            return
        import sentencepiece as spm
        self.bpe_model = spm.SentencePieceProcessor()
        self.bpe_model.load(self._model)

    def text2tokens(self, line: str) -> List[str]:
        """Split ``line`` into BPE tokens, preserving non-lang symbols."""
        self._build_sp()
        stripped = line.strip()
        pattern = self.non_lang_syms_pattern
        if pattern is not None:
            # Uppercase before splitting so the pattern matches the
            # uppercased non-lang symbols; drop empty/whitespace pieces.
            pieces = [p for p in pattern.split(stripped.upper())
                      if len(p.strip()) > 0]
        else:
            pieces = [stripped]
        tokens: List[str] = []
        for piece in pieces:
            if piece in self.non_lang_syms:
                # Non-linguistic symbols pass through as single tokens.
                tokens.append(piece)
            else:
                tokens.extend(tokenize_by_bpe_model(self.bpe_model, piece))
        return tokens

    def tokens2text(self, tokens: List[str]) -> str:
        """Join tokens back to text, mapping the BPE word-boundary
        marker (U+2581) to a space."""
        self._build_sp()
        joined = super().tokens2text(tokens)
        return joined.replace("▁", ' ').strip()