Spaces:
Sleeping
Sleeping
import torch | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer | |
import torchaudio | |
# Process-wide cache so the pretrained tokenizer/model are instantiated once
# instead of on every transcription request.
_WAV2VEC2_CACHE = {}


def _load_wav2vec2(model_name="facebook/wav2vec2-base-960h"):
    """Return the cached ``(tokenizer, model)`` pair for *model_name*."""
    if model_name not in _WAV2VEC2_CACHE:
        _WAV2VEC2_CACHE[model_name] = (
            Wav2Vec2Tokenizer.from_pretrained(model_name),
            Wav2Vec2ForCTC.from_pretrained(model_name),
        )
    return _WAV2VEC2_CACHE[model_name]


def speech_recognition(audio_file_path):
    """Transcribe the audio file at *audio_file_path* with Wav2Vec2 CTC.

    The file is loaded with ``torchaudio``, mixed down to a single channel,
    resampled to 16 kHz when its native rate differs, and decoded greedily
    (arg-max over the CTC logits) using the
    ``facebook/wav2vec2-base-960h`` checkpoint.

    Args:
        audio_file_path: Path of the audio file, passed straight to
            ``torchaudio.load``.

    Returns:
        The decoded transcription of the (single) utterance as a string.
    """
    target_rate = 16000
    tokenizer, model = _load_wav2vec2()

    waveform, sample_rate = torchaudio.load(audio_file_path)

    # torchaudio returns (channels, frames) by default. A multi-channel file
    # would survive ``squeeze()`` as a 2-D array and be fed to the model as a
    # malformed input, so average the channels down to mono first.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != target_rate:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=target_rate
        )
        waveform = resampler(waveform)

    # Drop only the channel axis; a bare ``squeeze()`` would also collapse a
    # one-sample clip to a 0-d array.
    samples = waveform.squeeze(0).numpy()
    input_values = tokenizer(
        samples, return_tensors="pt", padding="longest"
    ).input_values

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)
    return transcription[0]