Spaces:
Running
Running
File size: 1,694 Bytes
d71b5df feb2a2b ded23d4 feb2a2b ded23d4 feb2a2b d71b5df feb2a2b ded23d4 feb2a2b 3702096 feb2a2b 3702096 d71b5df 3702096 d71b5df feb2a2b ded23d4 d71b5df ded23d4 feb2a2b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 |
import json
import numpy as np
import torch
import torchaudio
from torchaudio.transforms import Resample
from huggingface_hub import hf_hub_download
import gradio as gr
from pipeline import PreTrainedPipeline
HF_HUB_URL = 'ales/wav2vec2-cv-be'
LM_HUB_FP = 'language_model/cv8be_5gram.bin'
def main(audio_fp: str):
    """Recognize Belarusian speech from an audio file and return a JSON debug report.

    Parameters
    ----------
    audio_fp : str
        Path to the recorded audio file (as supplied by the Gradio widget).

    Returns
    -------
    str
        JSON string holding the pipeline's recognition result plus
        diagnostic fields (original sampling rate, shapes, value range).
    """
    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)

    # Capture the shape BEFORE any channel conversion, so the reported
    # value really is the *initial* audio shape (the original code read
    # it after the stereo->mono averaging, which made the name a lie).
    init_audio_shape = list(audio.shape)

    # Convert stereo (or any multi-channel) audio to mono by averaging channels.
    converted_to_mono = False
    if audio.shape[0] > 1:
        audio = torch.mean(audio, dim=0, keepdim=True)
        converted_to_mono = True

    # Resample to the 16 kHz rate the acoustic model expects.
    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
    audio_resampled = resampler(audio)
    inputs = audio_resampled.numpy().flatten()  # cast to numpy as expected by the pipeline

    # Download the Language Model from the HF Hub (cached locally by huggingface_hub).
    lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)

    # Init pipeline and recognize speech.
    pipeline = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
    res = pipeline(inputs=inputs)

    # Attach diagnostics; convert shapes to plain lists so json.dumps
    # never trips over torch.Size / numpy tuple subclasses.
    res['sampling_rate_orig'] = sampling_rate
    res['init_audio_shape'] = init_audio_shape
    res['converted_to_mono'] = converted_to_mono
    res['inputs_shape'] = list(inputs.shape)
    res['inputs_max'] = np.max(inputs).item()
    res['inputs_min'] = np.min(inputs).item()

    # ensure_ascii=False keeps recognized Cyrillic text readable in the UI
    # instead of \uXXXX escapes.
    return json.dumps(res, indent=2, ensure_ascii=False)
# Build the Gradio UI: a microphone recorder wired to the recognizer,
# with the transcription report shown as plain text.
microphone_input = gr.inputs.Audio(
    source='microphone', type='filepath',
    label='Запішыце аўдыяфайл, каб распазнаваць маўленне'
)
iface = gr.Interface(fn=main, inputs=microphone_input, outputs='text')
iface.launch()
|