Spaces:
Running
Running
File size: 2,479 Bytes
aca9f3d feb2a2b ded23d4 feb2a2b ded23d4 aca9f3d feb2a2b d71b5df feb2a2b ded23d4 feb2a2b 44daa8d feb2a2b 44daa8d 3702096 44daa8d d71b5df 44daa8d feb2a2b ded23d4 d71b5df 1022fd5 ded23d4 44daa8d 1022fd5 44daa8d cff8d27 1022fd5 feb2a2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
from pprint import pformat
import numpy as np
import torch
import torchaudio
from torchaudio.transforms import Resample
from huggingface_hub import hf_hub_download
import gradio as gr
from pipeline import PreTrainedPipeline
# HF Hub repo id of the fine-tuned wav2vec2 acoustic model (Belarusian, CommonVoice).
HF_HUB_URL: str = 'ales/wav2vec2-cv-be'
# Path inside the repo to the KenLM 5-gram language model binary.
LM_HUB_FP: str = 'language_model/cv8be_5gram.bin'
def main(audio_fp: str):
    """Transcribe a Belarusian speech recording and collect technical metadata.

    Args:
        audio_fp: path to the recorded audio file (any format torchaudio can load).

    Returns:
        A ``(text, tech_data_str)`` tuple: the recognized text and a
        pretty-printed string with technical details of the processing run.
    """
    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)
    init_audio_shape = audio.shape

    # Convert stereo (or multi-channel) to mono by averaging channels.
    converted_to_mono = False
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
        converted_to_mono = True

    # Resample to the 16 kHz rate expected by the wav2vec2 model.
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_resampled = resampler(audio)
    inputs = audio_resampled.numpy().flatten()  # pipeline expects a flat numpy array

    # Download the Language Model from the HF Hub (cached locally after the first call).
    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

    # Init pipeline and recognize speech.
    pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
    pipeline_res = pipeline(inputs=inputs)
    text = pipeline_res['text'][0]  # unpack batch of size 1

    # Copy before mutating so the pipeline's own result dict is left intact
    # (the original code aliased and then deleted a key from it).
    tech_data = dict(pipeline_res)
    del tech_data['text']
    tech_data['sampling_rate_orig'] = sampling_rate
    tech_data['init_audio_shape'] = init_audio_shape
    tech_data['converted_to_mono'] = converted_to_mono
    tech_data['resampled_audio_shape'] = audio_resampled.shape
    tech_data['inputs_shape'] = inputs.shape
    tech_data['inputs_max'] = np.max(inputs).item()
    tech_data['inputs_min'] = np.min(inputs).item()

    tech_data_str = pformat(tech_data)
    return text, tech_data_str
# Build the Gradio UI: microphone recording in, recognized text + tech info out.
audio_input = gr.inputs.Audio(
    source='microphone', type='filepath',
    label='Запішыце аўдыяфайл, каб распазнаць маўленьне'
)
text_output = gr.outputs.Textbox(type='str', label='Распазнаны тэкст')
tech_output = gr.outputs.Textbox(type='str', label='Тэхнічная інфармацыя')

iface = gr.Interface(
    fn=main,
    inputs=audio_input,
    outputs=[text_output, tech_output],
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    description=('Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
                 'Акустычная мадэль + моўная мадэль.'
    ),
)
iface.launch()
|