Spaces:
Runtime error
Runtime error
changed the way input audio is processed
Browse files
app.py
CHANGED
@@ -3,6 +3,7 @@ from typing import Tuple
|
|
3 |
import numpy as np
|
4 |
|
5 |
import torch
|
|
|
6 |
from torchaudio.transforms import Resample
|
7 |
|
8 |
from huggingface_hub import hf_hub_download
|
@@ -16,12 +17,13 @@ HF_HUB_URL = 'ales/wav2vec2-cv-be'
|
|
16 |
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
|
17 |
|
18 |
|
19 |
-
def main(rate_audio_tuple: Tuple[int, np.ndarray]):
|
20 |
-
|
21 |
|
22 |
# resample audio to 16kHz
|
23 |
resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
|
24 |
-
audio_resampled = resampler(
|
|
|
25 |
|
26 |
# download Language Model from HF Hub
|
27 |
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
|
@@ -30,15 +32,18 @@ def main(rate_audio_tuple: Tuple[int, np.ndarray]):
|
|
30 |
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
|
31 |
|
32 |
# recognize speech
|
33 |
-
text_recognized = pipeline(inputs=
|
34 |
|
35 |
return text_recognized
|
36 |
|
37 |
|
38 |
iface = gr.Interface(
|
39 |
fn=main,
|
40 |
-
inputs=
|
41 |
-
|
|
|
|
|
|
|
42 |
)
|
43 |
|
44 |
iface.launch()
|
|
|
3 |
import numpy as np
|
4 |
|
5 |
import torch
|
6 |
+
import torchaudio
|
7 |
from torchaudio.transforms import Resample
|
8 |
|
9 |
from huggingface_hub import hf_hub_download
|
|
|
17 |
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
|
18 |
|
19 |
|
20 |
+
def main(audio_fp: str):
|
21 |
+
audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
|
22 |
|
23 |
# resample audio to 16kHz
|
24 |
resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
|
25 |
+
audio_resampled = resampler(audio)
|
26 |
+
inputs = audio_resampled.numpy().flatten() # cast to numpy as expected by the pipeline
|
27 |
|
28 |
# download Language Model from HF Hub
|
29 |
lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
|
|
|
32 |
pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
|
33 |
|
34 |
# recognize speech
|
35 |
+
text_recognized = pipeline(inputs=inputs)['text'][0]
|
36 |
|
37 |
return text_recognized
|
38 |
|
39 |
|
40 |
iface = gr.Interface(
|
41 |
fn=main,
|
42 |
+
inputs=gr.inputs.Audio(
|
43 |
+
source='microphone', type='filepath',
|
44 |
+
label='Запішыце аўдыяфайл, каб распазнаваць маўленне'
|
45 |
+
),
|
46 |
+
outputs='text'
|
47 |
)
|
48 |
|
49 |
iface.launch()
|