"""Command-line language prediction for a single audio file.

Loads the model weights, config and vocabulary stored under ``weights_dir``
at import time, then prints the predicted language for the wav file whose
path is given as the first command-line argument.
"""
import sys

import tensorflow as tf

# Restrict TensorFlow to at most the first physical GPU.  This is done before
# the project modules below are imported, matching the original statement
# order.  The slice yields an empty list when no GPU is present.
gpus = tf.config.list_physical_devices('GPU')
tf.config.set_visible_devices(gpus[0:1], 'GPU')

from vocab.vocab import Vocab
from dataset import create_dataset
from configs.config import Config
from featurizers.speech_featurizers import TFSpeechFeaturizer, NumpySpeechFeaturizer
from models.model import MulSpeechLR as Model
import librosa

# Locations of the saved run that is used for inference.
weights_dir = './saved_weights/20230228-084356/'
config_file = weights_dir + 'config.yml'
model_file = weights_dir + 'last/model'
vocab_file = weights_dir + 'vocab.txt'

config = Config(config_file)
speech_featurizer = TFSpeechFeaturizer(config.speech_config)
lr_vocab = Vocab(vocab_file)

# Build the model with one output per vocabulary token, restore its weights,
# attach the featurizer and build it for variable-length input.
lr_model = Model(**config.model_config, vocab_size=len(lr_vocab.token_list))
lr_model.load_weights(model_file)
lr_model.add_featurizers(speech_featurizer)
lr_model.init_build([None, config.speech_config['num_feature_bins']])
lr_model.summary()


def predict_wav(wav_path):
    """Print the language predicted for the audio file at ``wav_path``.

    The file is loaded with ``librosa.load`` at a 16000 Hz sample rate and
    passed to ``lr_model.predict_pb``.  The predicted index is looked up in
    ``lr_vocab.token_list`` and printed together with the returned
    probability multiplied by 100.

    Args:
        wav_path: Path to the audio file to classify.
    """
    sample_rate = 16000
    signal, _ = librosa.load(wav_path, sr=sample_rate)
    predict, prob = lr_model.predict_pb(signal)
    # NOTE(review): presumably `predict` is a scalar index tensor and `prob`
    # a scalar probability in [0, 1] -- confirm against predict_pb.
    language = lr_vocab.token_list[predict.numpy()]
    print("predict language={} prob={:.4f}".format(language, prob.numpy()*100))


if __name__ == '__main__':
    # Fail with a usage message instead of an IndexError traceback when the
    # wav path argument is missing.  Extra arguments are still ignored.
    if len(sys.argv) < 2:
        sys.exit("usage: {} <wav_path>".format(sys.argv[0]))
    wav_path = sys.argv[1]
    predict_wav(wav_path)