import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration


def get_whisper_encoder(model_name="openai/whisper-large-v3"):
    """Load a Whisper processor and the encoder of the matching model.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the processor and the model. Defaults to the checkpoint
            this helper originally hard-coded.

    Returns:
        A ``(processor, encoder)`` tuple. The encoder is the
        ``.model.encoder`` submodule of ``WhisperForConditionalGeneration``
        and is returned after ``.eval()`` has been called on it.
    """
    processor = WhisperProcessor.from_pretrained(model_name)
    full_model = WhisperForConditionalGeneration.from_pretrained(model_name)
    # Only the encoder is handed back; the rest of the model is not returned.
    encoder = full_model.model.encoder
    return processor, encoder.eval()


if __name__ == "__main__":
    # Smoke test: push a batch of random audio through the encoder and print
    # the shapes of the input features and of the encoder output.
    import numpy as np

    sample_rate = 16000
    clip_seconds = 30
    batch_size = 10

    # Use the GPU when present, but do not crash on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor, model = get_whisper_encoder()
    model = model.to(device)

    with torch.no_grad():
        waveform = np.random.rand(sample_rate * clip_seconds)
        input_features = processor(
            waveform,
            sampling_rate=sample_rate,
            return_tensors="pt",
        ).input_features.to(device)
        print(input_features.shape)

        # Tile the single example along the first axis to form a batch.
        out = model(input_features.repeat(batch_size, 1, 1))
        # NOTE(review): assumes the first entry of the encoder output is
        # `last_hidden_state` (what `list(out.values())[0]` read before) —
        # confirm against the installed transformers version.
        print(out.last_hidden_state.shape)