classla
/

wav2vec2-large-slavic-parlaspeech-hr-lm

Automatic Speech Recognition

Model card · Files and versions

5roop committed on Apr 29, 2022

Commit

ec1ce66

·

1 Parent(s): 15b4ff5

Update README.md

Correct the use example.

Files changed (1) hide show

README.md +12 -11

README.md CHANGED Viewed

@@ -38,28 +38,29 @@ Nikola Ljubešić, Danijel Koržinek, Peter Rupnik, Ivo-Pavao Jazbec. ParlaSpeec
 So far untested use from before:
 ```python
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
 import soundfile as sf
 import torch
 import os
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # load model and tokenizer
-processor = Wav2Vec2Processor.from_pretrained(
-    "classla/wav2vec2-large-slavic-parlaspeech-hr")
-model = Wav2Vec2ForCTC.from_pretrained("classla/wav2vec2-large-slavic-parlaspeech-hr")
 # download the example wav files:
 os.system("wget https://huggingface.co/classla/wav2vec2-large-slavic-parlaspeech-hr/raw/main/00020570a.flac.wav")
 # read the wav file
 speech, sample_rate = sf.read("00020570a.flac.wav")
-input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.to(device)
 # remove the raw wav file
 os.system("rm 00020570a.flac.wav")
-# retrieve logits
-logits = model.to(device)(input_values).logits
-# take argmax and decode
-predicted_ids = torch.argmax(logits, dim=-1)
-transcription = processor.decode(predicted_ids[0]).lower()
-# transcription: 'veliki broj poslovnih subjekata posluje sa minusom velik dio'
 ```

 So far untested use from before:
 ```python
+from transformers import Wav2Vec2ProcessorWithLM, Wav2Vec2ForCTC
 import soundfile as sf
 import torch
 import os
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # load model and tokenizer
+processor = Wav2Vec2ProcessorWithLM.from_pretrained(
+    "5roop/wav2vec2-large-slavic-parlaspeech-hr-lm")
+model = Wav2Vec2ForCTC.from_pretrained("5roop/wav2vec2-large-slavic-parlaspeech-hr-lm")
 # download the example wav files:
 os.system("wget https://huggingface.co/classla/wav2vec2-large-slavic-parlaspeech-hr/raw/main/00020570a.flac.wav")
 # read the wav file
 speech, sample_rate = sf.read("00020570a.flac.wav")
+input_values = processor(speech, sampling_rate=sample_rate, return_tensors="pt").input_values.cuda()
+inputs = processor(speech, sampling_rate=sample_rate, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+transcription = processor.batch_decode(logits.numpy()).text[0]
 # remove the raw wav file
 os.system("rm 00020570a.flac.wav")
+transcription # 'velik broj poslovnih subjekata poslao je sa minusom velik dio'
 ```