import functools

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer

# Default checkpoint and the sample rate the audio is resampled to before
# being handed to the model.
_DEFAULT_MODEL_NAME = "facebook/wav2vec2-base-960h"
_TARGET_SAMPLE_RATE = 16000


@functools.lru_cache(maxsize=2)
def _load_tokenizer_and_model(model_name):
    """Load the tokenizer/model pair for *model_name* once and reuse it."""
    # NOTE(review): recent transformers releases flag Wav2Vec2Tokenizer as
    # deprecated for raw-audio input in favour of Wav2Vec2Processor -- confirm
    # against the installed version before migrating.
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return tokenizer, model


def speech_recognition(audio_file_path, *, model_name=_DEFAULT_MODEL_NAME):
    """Transcribe the speech in an audio file with a Wav2Vec2 CTC model.

    The audio is loaded with ``torchaudio``, down-mixed to a single channel,
    resampled to 16 kHz when its native rate differs, and decoded greedily
    (arg-max over the logits at every frame).

    Args:
        audio_file_path: Path of the audio file, passed to ``torchaudio.load``.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model. Defaults to the checkpoint the function
            has always used.

    Returns:
        The decoded transcription of the audio as a string.
    """
    tokenizer, model = _load_tokenizer_and_model(model_name)

    waveform, sample_rate = torchaudio.load(audio_file_path)

    # torchaudio.load returns (channels, frames) by default. Averaging over the
    # channel axis always yields a 1-D signal; a plain squeeze() would leave a
    # stereo file as a 2-D array, which would then be processed as a batch of
    # two utterances with only the first one returned.
    mono = waveform.mean(dim=0)

    if sample_rate != _TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=_TARGET_SAMPLE_RATE
        )
        mono = resampler(mono)

    input_values = tokenizer(
        mono.numpy(), return_tensors="pt", padding="longest"
    ).input_values

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)
    return transcription[0]