import argparse import os import numpy as np import librosa import torch from tqdm import tqdm from transformers import Wav2Vec2FeatureExtractor, HubertModel def main(args): if not torch.cuda.is_available() and args.computed_device == 'cuda': print('CUDA is not available on this device. Switching to CPU.') args.computed_device = 'cpu' device = torch.device(args.computed_device) model = HubertModel.from_pretrained(args.model_path).to(device) feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(args.model_path) model.feature_extractor._freeze_parameters() model.eval() os.makedirs(args.audio_feature_saved_path, exist_ok=True) for wavfile in tqdm(os.listdir(args.audio_dir_path)): npy_save_path = os.path.join(args.audio_feature_saved_path, os.path.splitext(os.path.basename(wavfile))[0] + '.npy') if os.path.exists(npy_save_path): continue audio, sr = librosa.load(os.path.join(args.audio_dir_path, wavfile), sr=16000) input_values = feature_extractor(audio, sampling_rate=16000, padding=True, do_normalize=True, return_tensors="pt").input_values input_values = input_values.to(device) ws_feats = [] with torch.no_grad(): outputs = model(input_values, output_hidden_states=True) for i in range(len(outputs.hidden_states)): ws_feats.append(outputs.hidden_states[i].detach().cpu().numpy()) ws_feat_obj = np.array(ws_feats) ws_feat_obj = np.squeeze(ws_feat_obj, 1) if args.padding_to_align_audio: ws_feat_obj = np.pad(ws_feat_obj, ((0, 0), (0, 1), (0, 0)), 'edge') np.save(npy_save_path, ws_feat_obj) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Extract audio features using a pre-trained HuBERT model.") parser.add_argument("--model_path", type=str, default='weights/chinese-hubert-large', help="Path to the pre-trained model weights.") parser.add_argument("--audio_dir_path", type=str, default='./audio_samples/raw_audios/', help="Directory containing raw audio files.") parser.add_argument("--audio_feature_saved_path", type=str, default='./audio_samples/audio_features/', help="Directory where extracted audio features will be saved.") parser.add_argument("--computed_device", type=str, default='cuda', choices=['cuda', 'cpu'], help="Device to compute the audio features on. Use 'cuda' for GPU or 'cpu' for CPU.") parser.add_argument("--padding_to_align_audio", type=bool, default=True, help="Whether to pad the audio to align features.") args = parser.parse_args() main(args)