from pprint import pformat

import numpy as np

import torch
import torchaudio
from torchaudio.transforms import Resample

from huggingface_hub import hf_hub_download

import gradio as gr

from pipeline import PreTrainedPipeline


# model repo on the Hugging Face Hub, and the path (inside that repo)
# of the 5-gram language-model binary
HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'


def main(audio_fp: str):
    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)  # decode to a float32 waveform in [-1.0, 1.0]
    init_audio_shape = audio.shape

    # collapse multi-channel audio (e.g. stereo) to mono by averaging channels
    converted_to_mono = False
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
        converted_to_mono = True

    # resample to 16 kHz, the rate wav2vec2 models are trained on
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_resampled = resampler(audio)
    inputs = audio_resampled.numpy().flatten()  # flatten to the 1-D float numpy array the pipeline expects

    # download Language Model from HF Hub
    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
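    # hf_hub_download caches the file locally, so only the first request
    # actually downloads the language model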

    # init pipeline
    pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
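    # PreTrainedPipeline is defined in this Space's local pipeline.py; per the
    # app description it combines the wav2vec2 acoustic model with the n-gram
    # language model during decoding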

    # recognize speech
    pipeline_res = pipeline(inputs=inputs)
    text = pipeline_res['text'][0]  # unpack batch of size 1

    # add technical information to the output; copy the dict first so the
    # pipeline's result is not mutated through a shared reference
    tech_data = dict(pipeline_res)
    del tech_data['text']
    tech_data['sampling_rate_orig'] = sampling_rate
    tech_data['init_audio_shape'] = init_audio_shape
    tech_data['converted_to_mono'] = converted_to_mono
    tech_data['resampled_audio_shape'] = audio_resampled.shape
    tech_data['inputs_shape'] = inputs.shape
    tech_data['inputs_max'] = np.max(inputs).item()
    tech_data['inputs_min'] = np.min(inputs).item()

    tech_data_str = pformat(tech_data)  # pretty-print the dict for the technical-info Textbox

    return text, tech_data_str
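
# a quick local sanity check; 'sample.wav' is a hypothetical file path:
#   text, info = main('sample.wav')
#   print(text)
#   print(info)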


iface = gr.Interface(
    fn=main,
    inputs=gr.inputs.Audio(
        source='microphone', type='filepath',
        # label: "Record an audio file to recognize speech"
        label='Запішыце аўдыяфайл, каб распазнаць маўленьне'
    ),
    outputs=[
        gr.outputs.Textbox(type='str', label='Распазнаны тэкст'),  # "Recognized text"
        gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')  # "Technical information"
    ],
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    # description: "A Belarusian speech-recognition model trained on the
    # Common Voice 8 dataset. Acoustic model + language model."
    description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датасэце Common Voice 8.\n'
                 'Акустычная мадэль + моўная мадэль.'
    ),
)
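
# `main` returns a (text, tech_data_str) pair, one value per output Textbox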

iface.launch()
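
# running this script directly (e.g. `python app.py`, assuming that is this
# file's name) serves the demo on a local URL; on a Hugging Face Space it is
# executed automatically and launch() exposes the hosted UI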