Spaces:
Running
Running
import json | |
import numpy as np | |
import torch | |
import torchaudio | |
from torchaudio.transforms import Resample | |
from huggingface_hub import hf_hub_download | |
import gradio as gr | |
from pipeline import PreTrainedPipeline | |
# Hugging Face Hub repository id. It is used both as the model path handed to
# the pipeline and as the repository the language model file is fetched from.
HF_HUB_URL: str = 'ales/wav2vec2-cv-be'
# Path of the language model file inside the repository above.
LM_HUB_FP: str = 'language_model/cv8be_5gram.bin'
# Speech recognition pipeline shared by every call to `main`; built on first use.
_PIPELINE = None


def _get_pipeline():
    """Return the shared recognition pipeline, creating it on the first call."""
    global _PIPELINE
    if _PIPELINE is None:
        # download Language Model from HF Hub
        lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
        # init pipeline once: rebuilding it per request reloads the model every time
        _PIPELINE = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
    return _PIPELINE


def main(audio_fp: str) -> str:
    """Recognize speech in an audio file and return a JSON report.

    The audio is loaded, averaged down to a single channel when it has more
    than one, resampled to 16 kHz, and passed to the recognition pipeline as
    a flat numpy array.

    Args:
        audio_fp: Path of the audio file to transcribe.

    Returns:
        A JSON string (2-space indent, non-ASCII characters kept as-is) with
        the pipeline output plus diagnostic fields: ``sampling_rate_orig``,
        ``init_audio_shape``, ``converted_to_mono``, ``inputs_shape``,
        ``inputs_max`` and ``inputs_min``.
    """
    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)

    # Record the shape before any conversion, so it really is the initial one.
    init_audio_shape = list(audio.shape)

    # convert stereo to mono
    converted_to_mono = audio.shape[0] > 1
    if converted_to_mono:
        audio = torch.mean(audio, dim=0, keepdim=True)

    # resample audio to 16kHz
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    inputs = resampler(audio).numpy().flatten()  # cast to numpy as expected by the pipeline

    # recognize speech
    # NOTE(review): the pipeline output is assumed to be a JSON-serializable
    # dict -- confirm against pipeline.PreTrainedPipeline.
    res = _get_pipeline()(inputs=inputs)

    # Build a new dict rather than mutating the object the pipeline returned.
    report = {
        **res,
        'sampling_rate_orig': sampling_rate,
        'init_audio_shape': init_audio_shape,
        'converted_to_mono': converted_to_mono,
        'inputs_shape': inputs.shape,
        'inputs_max': np.max(inputs).item(),
        'inputs_min': np.min(inputs).item(),
    }

    # ensure_ascii=False keeps the recognized Cyrillic text readable instead of
    # turning every character into a \uXXXX escape.
    return json.dumps(report, indent=2, ensure_ascii=False)
# Web UI: a microphone recording is handed to `main` as a file path
# (type='filepath'), and the string `main` returns is shown as plain text.
iface = gr.Interface(
    outputs='text',
    inputs=gr.inputs.Audio(
        label='Запішыце аўдыяфайл, каб распазнаваць маўленне',
        type='filepath',
        source='microphone',
    ),
    fn=main,
)

# Start serving the interface.
iface.launch()