Spaces:
Running
Running
File size: 1,694 Bytes
d71b5df feb2a2b ded23d4 feb2a2b ded23d4 feb2a2b d71b5df feb2a2b ded23d4 feb2a2b 3702096 feb2a2b 3702096 d71b5df 3702096 d71b5df feb2a2b ded23d4 d71b5df ded23d4 feb2a2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import json
import numpy as np
import torch
import torchaudio
from torchaudio.transforms import Resample
from huggingface_hub import hf_hub_download
import gradio as gr
from pipeline import PreTrainedPipeline
HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
def main(audio_fp: str):
    """Recognize Belarusian speech from an audio file and return a JSON debug report.

    Parameters
    ----------
    audio_fp : str
        Path to the recorded audio file (as supplied by the Gradio widget).

    Returns
    -------
    str
        JSON string holding the pipeline's recognition result plus
        diagnostic fields (original sampling rate, shapes, value range).
    """
    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)

    # Capture the shape BEFORE any channel conversion, so the reported
    # value really is the *initial* audio shape (the original code read
    # it after the stereo->mono averaging, which made the name a lie).
    init_audio_shape = list(audio.shape)

    # Convert stereo (or any multi-channel) audio to mono by averaging channels.
    converted_to_mono = False
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
        converted_to_mono = True

    # Resample to the 16 kHz rate the acoustic model expects.
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_resampled = resampler(audio)
    inputs = audio_resampled.numpy().flatten()  # cast to numpy as expected by the pipeline

    # Download the Language Model from the HF Hub (cached locally by huggingface_hub).
    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

    # Init pipeline and recognize speech.
    pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
    res = pipeline(inputs=inputs)

    # Attach diagnostics; convert shapes to plain lists so json.dumps
    # never trips over torch.Size / numpy tuple subclasses.
    res['sampling_rate_orig'] = sampling_rate
    res['init_audio_shape'] = init_audio_shape
    res['converted_to_mono'] = converted_to_mono
    res['inputs_shape'] = list(inputs.shape)
    res['inputs_max'] = np.max(inputs).item()
    res['inputs_min'] = np.min(inputs).item()

    # ensure_ascii=False keeps recognized Cyrillic text readable in the UI
    # instead of \uXXXX escapes.
    return json.dumps(res, indent=2, ensure_ascii=False)
# Build the Gradio UI: a microphone recorder wired to the recognizer,
# with the transcription report shown as plain text.
microphone_input = gr.inputs.Audio(
    source='microphone', type='filepath',
    label='Запішыце аўдыяфайл, каб распазнаваць маўленне'
)
iface = gr.Interface(fn=main, inputs=microphone_input, outputs='text')
iface.launch()
|