# AI_Application / app.py
# Uploaded by Antoniskaraolis
# Commit: "Create app.py" (f3c7107)
# File size: 825 Bytes
# (Hugging Face file-viewer links "raw" / "history" / "blame" omitted from code.)
import functools

import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
# Pretrained checkpoint used for both the tokenizer and the CTC model.
_MODEL_NAME = "facebook/wav2vec2-base-960h"
# Sample rate (Hz) every input is resampled to before inference.
_TARGET_SAMPLE_RATE = 16000


@functools.lru_cache(maxsize=1)
def _load_tokenizer_and_model():
    """Load the pretrained tokenizer and CTC model once and reuse them."""
    # NOTE(review): recent transformers releases recommend Wav2Vec2Processor
    # over Wav2Vec2Tokenizer for raw audio -- confirm against the pinned
    # transformers version before switching.
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(_MODEL_NAME)
    model = Wav2Vec2ForCTC.from_pretrained(_MODEL_NAME)
    return tokenizer, model


def speech_recognition(audio_file_path):
    """Transcribe the speech in an audio file to text.

    The audio is loaded with torchaudio, down-mixed to a single channel,
    resampled to 16 kHz when necessary, and decoded greedily (arg-max over
    the CTC logits) with the ``facebook/wav2vec2-base-960h`` checkpoint.

    Args:
        audio_file_path: Path (or other source accepted by
            ``torchaudio.load``) of the audio to transcribe.

    Returns:
        The decoded transcription string for the audio.

    Raises:
        Whatever ``torchaudio.load`` or ``from_pretrained`` raise for an
        unreadable file or an unavailable checkpoint; nothing is caught here.
    """
    # Loading the weights is expensive and does not depend on the input,
    # so it is cached across calls instead of repeated for every file.
    tokenizer, model = _load_tokenizer_and_model()

    waveform, sample_rate = torchaudio.load(audio_file_path)

    # torchaudio.load returns a (channels, frames) tensor by default.
    # Without a down-mix, multi-channel audio would reach the tokenizer as a
    # 2-D array and only the first row's transcription would be returned.
    if waveform.dim() > 1 and waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != _TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sample_rate, new_freq=_TARGET_SAMPLE_RATE
        )
        waveform = resampler(waveform)

    # Drop only the channel axis; a bare squeeze() would also collapse a
    # one-frame waveform into a 0-d array.
    samples = waveform.squeeze(0).numpy()
    input_values = tokenizer(
        samples, return_tensors="pt", padding="longest"
    ).input_values

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(input_values).logits

    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = tokenizer.batch_decode(predicted_ids)
    return transcription[0]