ales committed on
Commit
ded23d4
·
1 Parent(s): 7bf07e0

changed the way input audio is processed

Browse files
Files changed (1) hide show
  1. app.py +11 -6
app.py CHANGED
@@ -3,6 +3,7 @@ from typing import Tuple
3
  import numpy as np
4
 
5
  import torch
 
6
  from torchaudio.transforms import Resample
7
 
8
  from huggingface_hub import hf_hub_download
@@ -16,12 +17,13 @@ HF_HUB_URL = 'ales/wav2vec2-cv-be'
16
  LM_HUB_FP = 'language_model/cv8be_5gram.bin'
17
 
18
 
19
- def main(rate_audio_tuple: Tuple[int, np.ndarray]):
20
- sampling_rate, audio = rate_audio_tuple
21
 
22
  # resample audio to 16kHz
23
  resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
24
- audio_resampled = resampler(torch.tensor(audio)).numpy().flatten()
 
25
 
26
  # download Language Model from HF Hub
27
  lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
@@ -30,15 +32,18 @@ def main(rate_audio_tuple: Tuple[int, np.ndarray]):
30
  pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
31
 
32
  # recognize speech
33
- text_recognized = pipeline(inputs=audio_resampled)['text'][0]
34
 
35
  return text_recognized
36
 
37
 
38
  iface = gr.Interface(
39
  fn=main,
40
- inputs='microphone',
41
- outputs="text"
 
 
 
42
  )
43
 
44
  iface.launch()
 
3
  import numpy as np
4
 
5
  import torch
6
+ import torchaudio
7
  from torchaudio.transforms import Resample
8
 
9
  from huggingface_hub import hf_hub_download
 
17
  LM_HUB_FP = 'language_model/cv8be_5gram.bin'
18
 
19
 
20
+ def main(audio_fp: str):
21
+ audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
22
 
23
  # resample audio to 16kHz
24
  resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
25
+ audio_resampled = resampler(audio)
26
+ inputs = audio_resampled.numpy().flatten() # cast to numpy as expected by the pipeline
27
 
28
  # download Language Model from HF Hub
29
  lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
 
32
  pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
33
 
34
  # recognize speech
35
+ text_recognized = pipeline(inputs=inputs)['text'][0]
36
 
37
  return text_recognized
38
 
39
 
40
  iface = gr.Interface(
41
  fn=main,
42
+ inputs=gr.inputs.Audio(
43
+ source='microphone', type='filepath',
44
+ label='Запішыце аўдыяфайл, каб распазнаваць маўленне'
45
+ ),
46
+ outputs='text'
47
  )
48
 
49
  iface.launch()