|
---
license: cc-by-4.0
datasets:
- nwu-ctext/nchlt
language:
- afr
- eng
- nbl
- nso
- sot
- ssw
- tsn
- tso
- ven
- xho
- zul
base_model: facebook/mms-1b-all
pipeline_tag: automatic-speech-recognition
---
|
|
|
Inference Example
|
```python
|
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC |
|
import torch |
|
import librosa |
|
import os |
|
|
|
# Model identifier passed to `from_pretrained` for both the model and the processor.
model_name = "guymandude/MMS-ASR-ZA-11"


def load_audio_file(path, sampling_rate=16_000):
    """Load an audio file, resampled to the rate used for inference.

    Args:
        path: Path of the audio file to read.
        sampling_rate: Target sampling rate in Hz handed to ``librosa.load``.
            Defaults to 16 kHz, the rate this example feeds to the processor.
            Pass ``None`` to keep the file's native rate.

    Returns:
        A dict with the waveform under ``"array"`` and the sampling rate it
        was loaded at under ``"sampling_rate"``.
    """
    # Loading with `sr=None` would keep the file's native rate; any recording
    # that is not already 16 kHz would then be fed to the model at the wrong
    # rate and transcribed incorrectly. Resample while loading instead.
    audio_array, loaded_rate = librosa.load(path, sr=sampling_rate)
    return {"array": audio_array, "sampling_rate": loaded_rate}
|
|
|
# Fall back to the CPU so the example also runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

model = Wav2Vec2ForCTC.from_pretrained(model_name, ignore_mismatched_sizes=True).to(device)
processor = Wav2Vec2Processor.from_pretrained(model_name)

# Change to one of the supported languages:
# [eng, afr, sot, zul, xho, nso, nbl, tso, tsn, ven, ssw]
# A single variable keeps the model adapter and the tokenizer vocabulary in sync.
target_lang = "tsn"
model.load_adapter(target_lang)
processor.tokenizer.set_target_lang(target_lang)

audio = load_audio_file("<AUDIO PATH>")

# Report the rate the audio was actually loaded at rather than asserting
# 16 kHz: the feature extractor then rejects mismatched audio with an error
# instead of silently producing a wrong transcription.
input_dict = processor(
    audio["array"],
    sampling_rate=audio["sampling_rate"],
    return_tensors="pt",
    padding=True,
)

# Inference only: skip autograd bookkeeping to save memory and time.
with torch.no_grad():
    logits = model(input_dict.input_values.to(device)).logits

# Greedy CTC decoding: most likely token per frame for the single input.
pred_ids = torch.argmax(logits, dim=-1)[0]

print(processor.decode(pred_ids))
|
```