"""Gradio demo: record speech from the microphone and transcribe it with the `ales/wav2vec2-cv-be` pipeline."""

import json

import numpy as np

import torch
import torchaudio
from torchaudio.transforms import Resample

from huggingface_hub import hf_hub_download

import gradio as gr

from pipeline import PreTrainedPipeline


# Hugging Face Hub repository identifier (despite the name, not a full URL):
# it is passed both as the `repo_id` for the language-model download and as
# the `model_path` of the recognition pipeline.
HF_HUB_URL: str = 'ales/wav2vec2-cv-be'

# Path of the language-model file inside the repository above.
LM_HUB_FP: str = 'language_model/cv8be_5gram.bin'


# Lazily-built recognition pipeline, shared by every call to `main`.
_PIPELINE = None


def _get_pipeline():
    """Return the speech-recognition pipeline, building it on first use.

    Resolving the language model and constructing the pipeline is done once
    and the instance is reused afterwards, instead of being repeated for
    every recording.
    """
    global _PIPELINE
    if _PIPELINE is None:
        # fetch the language model file from the HF Hub repository
        lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
        _PIPELINE = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
    return _PIPELINE


def main(audio_fp: str) -> str:
    """Transcribe an audio file and return the result as a JSON string.

    The audio is loaded, averaged down to one channel when it has several,
    resampled to 16 kHz and passed to the pipeline as a flat NumPy array.
    A few diagnostic fields describing the input are added to the pipeline
    result before it is serialized.

    Args:
        audio_fp: Path of the audio file to transcribe.

    Returns:
        The pipeline result plus diagnostic fields, dumped as indented JSON.
        Non-ASCII characters are emitted as-is rather than as ``\\uXXXX``
        escapes, so recognized text stays readable.
    """
    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)

    # convert stereo (or any multi-channel audio) to mono by averaging channels
    converted_to_mono = audio.shape[0] > 1
    if converted_to_mono:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # resample audio to 16kHz
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_resampled = resampler(audio)
    inputs = audio_resampled.numpy().flatten()  # cast to numpy as expected by the pipeline

    # recognize speech
    # NOTE(review): the result is assumed to be a JSON-serializable dict — confirm
    # against PreTrainedPipeline.
    res = _get_pipeline()(inputs=inputs)

    # diagnostic information about the processed input
    res['sampling_rate_orig'] = sampling_rate
    # NOTE: this is the shape after the mono down-mix, before resampling
    res['init_audio_shape'] = list(audio.shape)
    res['converted_to_mono'] = converted_to_mono
    res['inputs_shape'] = list(inputs.shape)
    res['inputs_max'] = np.max(inputs).item()
    res['inputs_min'] = np.min(inputs).item()

    # ensure_ascii=False keeps Cyrillic (and any other non-ASCII) text readable
    return json.dumps(res, indent=2, ensure_ascii=False)


# Microphone recorder; with type='filepath' the recording is expected to reach
# `main` as a path on disk.
mic_input = gr.inputs.Audio(
    label='Запішыце аўдыяфайл, каб распазнаваць маўленне',
    type='filepath',
    source='microphone',
)

# Single-function UI: recorded audio in, plain text out.
iface = gr.Interface(fn=main, inputs=mic_input, outputs='text')

iface.launch()