import torch
import torchaudio.functional as F
from datasets import load_dataset
from transformers import AutoModelForCTC, AutoProcessor, Wav2Vec2ProcessorWithLM

# "." loads the model from the current working directory; a Hub model id works here too.
model_id = "."

# Stream a test sample from Common Voice 7 (Swedish); the dataset is gated,
# so this requires being logged in to the Hugging Face Hub.
sample_iter = iter(load_dataset("mozilla-foundation/common_voice_7_0", "sv-SE", split="test", streaming=True, use_auth_token=True))

sample = next(sample_iter)

# Common Voice audio is sampled at 48 kHz; the model expects 16 kHz input.
resampled_audio = F.resample(torch.tensor(sample["audio"]["array"]), 48_000, 16_000).numpy()

model = AutoModelForCTC.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)
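# Note: when the repository ships an n-gram language model alongside the
# tokenizer, AutoProcessor is expected to resolve to Wav2Vec2ProcessorWithLM
# (hence its presence in the imports); decoding then goes through pyctcdecode.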

input_values = processor(resampled_audio, sampling_rate=16_000, return_tensors="pt").input_values

with torch.no_grad():
    logits = model(input_values).logits

# batch_decode on Wav2Vec2ProcessorWithLM takes the raw logits (as numpy)
# and returns beam-search transcriptions rescored by the language model.
transcription = processor.batch_decode(logits.numpy()).text
print(transcription)
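
# --- Optional: a rough WER estimate over a few streamed samples ---
# A minimal sketch, assuming the `jiwer` package is installed. Common Voice
# references ("sentence") keep casing and punctuation, so some text
# normalization would be needed for a score comparable to reported numbers.
import jiwer

predictions, references = [], []
for _ in range(5):  # tiny sample; increase for a meaningful estimate
    s = next(sample_iter)
    audio = F.resample(torch.tensor(s["audio"]["array"]), 48_000, 16_000).numpy()
    inputs = processor(audio, sampling_rate=16_000, return_tensors="pt").input_values
    with torch.no_grad():
        batch_logits = model(inputs).logits
    predictions.append(processor.batch_decode(batch_logits.numpy()).text[0])
    references.append(s["sentence"])

print("WER:", jiwer.wer(references, predictions))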