Spaces:
Runtime error
Runtime error
File size: 3,204 Bytes
aca9f3d feb2a2b 6b93fd2 51f7123 feb2a2b 6b93fd2 feb2a2b 1c6b627 feb2a2b 6b93fd2 feb2a2b 44daa8d feb2a2b 44daa8d 1c6b627 6b93fd2 44daa8d 51f7123 3702096 44daa8d d71b5df 44daa8d feb2a2b 6b93fd2 5b4ea6e abfa68a 5b4ea6e feb2a2b 1c6b627 44daa8d 1022fd5 44daa8d cff8d27 6b93fd2 1c6b627 feb2a2b 5b4ea6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
from pprint import pformat
from huggingface_hub import hf_hub_download
import librosa
import gradio as gr
from pipeline import PreTrainedPipeline
# Hugging Face Hub repository that serves both as the model path and as the
# source of the language model file.
HF_HUB_URL = 'ales/wav2vec2-cv-be'
# Location of the language model file inside that repository.
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
# Sampling rate (Hz) that audio is loaded at before recognition, i.e. 16 kHz.
MODEL_SAMPLING_RATE = 16000

# Fetch the language model file from the Hub (runs once, at import time).
lm_fp = hf_hub_download(filename=LM_HUB_FP, repo_id=HF_HUB_URL)

# Build the recognition pipeline once so that every request reuses it.
pipeline = PreTrainedPipeline(language_model_fp=lm_fp, model_path=HF_HUB_URL)
def main(recorded_audio_fp: str, uploaded_audio_fp: str):
    """Recognize speech in one audio file and describe how it was processed.

    Either argument may be ``None`` when that input was not provided. When
    both are given, the recorded file is used and the uploaded one is only
    mentioned in the technical information.

    Args:
        recorded_audio_fp: Path to the recorded audio file, or ``None``.
        uploaded_audio_fp: Path to the uploaded audio file, or ``None``.

    Returns:
        A 2-tuple. Normally it holds the recognized text and a pretty-printed
        string with technical details. When neither file is provided, it
        holds an error message in Belarusian and the same message in English.
    """
    # Nothing to process: report the problem in both output fields.
    if recorded_audio_fp is None and uploaded_audio_fp is None:
        return (
            'Памылка! Вы мусіце альбо запісаць, альбо запампаваць аўдыяфайл.',
            'Error! You have to either record or upload an audiofile.'
        )

    # The recording takes priority over the upload when both are present.
    if recorded_audio_fp is not None:
        chosen_fp, chosen_kind = recorded_audio_fp, 'recorded'
    else:
        chosen_fp, chosen_kind = uploaded_audio_fp, 'uploaded'

    # Load the audio as mono at the model's sampling rate; the sampling rate
    # returned alongside the samples is not needed.
    waveform, _ = librosa.load(chosen_fp, sr=MODEL_SAMPLING_RATE, mono=True)

    # Run recognition; the pipeline returns a batch, of which only the first
    # (and only) text is used.
    result = pipeline(inputs=waveform)
    text = result['text'][0]

    # Everything the pipeline returned except the text is reported as
    # technical information, extended with details about the input.
    del result['text']
    result.update({
        'used_audiofile': chosen_kind,
        'recorded_file_present': recorded_audio_fp is not None,
        'uploaded_file_present': uploaded_audio_fp is not None,
        'audiofile_path': chosen_fp,
        'model_sampling_rate': MODEL_SAMPLING_RATE,
        'inputs_shape': waveform.shape,
        'inputs_max': waveform.max().item(),
        'inputs_min': waveform.min().item(),
    })

    return text, pformat(result)
# Markdown passed to the interface as its article: a link to the model
# repository followed by a page-visits badge image.
article = (
    '\n'
    'The model used can be found here: [ales/wav2vec2-cv-be](https://huggingface.co/ales/wav2vec2-cv-be)\n'
    '![Page Visits](https://visitor-badge.glitch.me/badge?page_id=huggingface.co/spaces/ales/wav2vec2-cv-be-lm&left_color=darkgray&right_color=crimson&left_text=Page%20Visits)\n'
)
# Web UI: two optional audio inputs (microphone recording and file upload)
# feed `main`, whose two return values fill the two text outputs.
# NOTE(review): `gr.inputs.*`, `gr.outputs.*`, `optional=` and
# `launch(enable_queue=...)` appear to be legacy Gradio API — confirm they
# are still supported by the Gradio version this app is deployed with.
iface = gr.Interface(
    fn=main,
    inputs=[
        # Both inputs hand `main` a file path and may be left empty.
        gr.inputs.Audio(source=source, type='filepath', label=label, optional=True)
        for source, label in (
            ('microphone', 'Запішыце аўдыяфайл, каб распазнаць маўленьне'),
            ('upload', 'Альбо загрузіце ўжо запісаны аўдыяфайл сюды'),
        )
    ],
    outputs=[
        # First box: recognized text; second box: technical information.
        gr.outputs.Textbox(type='str', label=label)
        for label in ('Распазнаны тэкст', 'Тэхнічная інфармацыя')
    ],
    title='wav2vec2 fine-tuned on CommonVoice 8 Be + Language Model',
    description=(
        'Мадэль распазнаваньня беларускага маўленьня, навучаная на датсэце Common Voice 8.\n'
        'Акустычная мадэль + моўная мадэль.'
    ),
    article=article,
)

# Start the app with request queueing enabled.
iface.launch(enable_queue=True)
|