# Copyright (c) 2023 Wenet Community. (authors: Dinghao Zhou)
#                                     (authors: Xingchen Song)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

from wenet.text.base_tokenizer import BaseTokenizer
from wenet.text.bpe_tokenizer import BpeTokenizer
from wenet.text.char_tokenizer import CharTokenizer
from wenet.text.hugging_face_tokenizer import HuggingFaceTokenizer
from wenet.text.paraformer_tokenizer import ParaformerTokenizer
from wenet.text.whisper_tokenizer import WhisperTokenizer


def init_tokenizer(configs) -> BaseTokenizer:
    """Build a tokenizer instance from a parsed config dict.

    Args:
        configs: config mapping; ``configs['tokenizer']`` selects the
            tokenizer type (defaults to ``"char"`` when absent) and
            ``configs['tokenizer_conf']`` carries the per-type options.

    Returns:
        The constructed ``BaseTokenizer`` subclass instance.

    Raises:
        NotImplementedError: if the configured tokenizer type is unknown.
    """
    # TODO(xcsong): Forcefully read the 'tokenizer' attribute.
    tokenizer_type = configs.get("tokenizer", "char")
    # Hoist the repeated sub-dict lookup; default to {} so the "whisper"
    # branch fails with a clear KeyError rather than on the outer dict.
    tokenizer_conf = configs.get('tokenizer_conf', {})
    if tokenizer_type == "whisper":
        tokenizer = WhisperTokenizer(
            multilingual=tokenizer_conf['is_multilingual'],
            num_languages=tokenizer_conf['num_languages'])
    elif tokenizer_type == "char":
        tokenizer = CharTokenizer(
            tokenizer_conf['symbol_table_path'],
            tokenizer_conf['non_lang_syms_path'],
            split_with_space=tokenizer_conf.get('split_with_space', False),
            connect_symbol=tokenizer_conf.get('connect_symbol', ''))
    elif tokenizer_type == "bpe":
        tokenizer = BpeTokenizer(
            tokenizer_conf['bpe_path'],
            tokenizer_conf['symbol_table_path'],
            tokenizer_conf['non_lang_syms_path'],
            split_with_space=tokenizer_conf.get('split_with_space', False))
    elif tokenizer_type == 'paraformer':
        tokenizer = ParaformerTokenizer(
            symbol_table=tokenizer_conf['symbol_table_path'],
            seg_dict=tokenizer_conf['seg_dict_path'])
    elif tokenizer_type == 'huggingface':
        tokenizer = HuggingFaceTokenizer(model=tokenizer_conf['llm_path'])
    else:
        # Name the offending type so misconfiguration is easy to diagnose.
        raise NotImplementedError(
            'unknown tokenizer type: {}'.format(tokenizer_type))
    # Bug fix: the original logged configs["tokenizer"], which raises
    # KeyError precisely when the key is absent and the "char" default
    # above was used. Log the resolved type instead.
    logging.info("use {} tokenizer".format(tokenizer_type))
    return tokenizer