# Copyright (c) 2023 Wenet Community. (authors: Dinghao Zhou)
#                                     (authors: Xingchen Song)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

from wenet.text.base_tokenizer import BaseTokenizer
from wenet.text.bpe_tokenizer import BpeTokenizer
from wenet.text.char_tokenizer import CharTokenizer
from wenet.text.hugging_face_tokenizer import HuggingFaceTokenizer
from wenet.text.paraformer_tokenizer import ParaformerTokenizer
from wenet.text.whisper_tokenizer import WhisperTokenizer
def init_tokenizer(configs) -> BaseTokenizer: | |
# TODO(xcsong): Forcefully read the 'tokenizer' attribute. | |
tokenizer_type = configs.get("tokenizer", "char") | |
if tokenizer_type == "whisper": | |
tokenizer = WhisperTokenizer( | |
multilingual=configs['tokenizer_conf']['is_multilingual'], | |
num_languages=configs['tokenizer_conf']['num_languages']) | |
elif tokenizer_type == "char": | |
tokenizer = CharTokenizer( | |
configs['tokenizer_conf']['symbol_table_path'], | |
configs['tokenizer_conf']['non_lang_syms_path'], | |
split_with_space=configs['tokenizer_conf'].get( | |
'split_with_space', False), | |
connect_symbol=configs['tokenizer_conf'].get('connect_symbol', '')) | |
elif tokenizer_type == "bpe": | |
tokenizer = BpeTokenizer( | |
configs['tokenizer_conf']['bpe_path'], | |
configs['tokenizer_conf']['symbol_table_path'], | |
configs['tokenizer_conf']['non_lang_syms_path'], | |
split_with_space=configs['tokenizer_conf'].get( | |
'split_with_space', False)) | |
elif tokenizer_type == 'paraformer': | |
tokenizer = ParaformerTokenizer( | |
symbol_table=configs['tokenizer_conf']['symbol_table_path'], | |
seg_dict=configs['tokenizer_conf']['seg_dict_path']) | |
elif tokenizer_type == 'huggingface': | |
tokenizer = HuggingFaceTokenizer( | |
model=configs['tokenizer_conf']['llm_path']) | |
else: | |
raise NotImplementedError | |
logging.info("use {} tokenizer".format(configs["tokenizer"])) | |
return tokenizer | |