Spaces:

ales
/

wav2vec2-cv-be-lm

Runtime error

ales committed on Apr 13, 2022

Commit

ded23d4

1 Parent(s): 7bf07e0

changed the way input audio is processed

Files changed (1) hide show

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from typing import Tuple
 import numpy as np
 import torch
 from torchaudio.transforms import Resample
 from huggingface_hub import hf_hub_download
@@ -16,12 +17,13 @@ HF_HUB_URL = 'ales/wav2vec2-cv-be'
 LM_HUB_FP = 'language_model/cv8be_5gram.bin'
-def main(rate_audio_tuple: Tuple[int, np.ndarray]):
-    sampling_rate, audio = rate_audio_tuple
     # resample audio to 16kHz
     resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
-    audio_resampled = resampler(torch.tensor(audio)).numpy().flatten()
     # download Language Model from HF Hub
     lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
@@ -30,15 +32,18 @@ def main(rate_audio_tuple: Tuple[int, np.ndarray]):
     pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
     # recognize speech
-    text_recognized = pipeline(inputs=audio_resampled)['text'][0]
     return text_recognized
 iface = gr.Interface(
     fn=main,
-    inputs='microphone',
-    outputs="text"
 )
 iface.launch()

 import numpy as np
 import torch
+import torchaudio
 from torchaudio.transforms import Resample
 from huggingface_hub import hf_hub_download
 LM_HUB_FP = 'language_model/cv8be_5gram.bin'
+def main(audio_fp: str):
+    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
     # resample audio to 16kHz
     resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
+    audio_resampled = resampler(audio)
+    inputs = audio_resampled.numpy().flatten()  # cast to numpy as expected by the pipeline
     # download Language Model from HF Hub
     lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
     pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
     # recognize speech
+    text_recognized = pipeline(inputs=inputs)['text'][0]
     return text_recognized
 iface = gr.Interface(
     fn=main,
+    inputs=gr.inputs.Audio(
+        source='microphone', type='filepath',
+        label='Запішыце аўдыяфайл, каб распазнаваць маўленне'
+    ),
+    outputs='text'
 )
 iface.launch()