Spaces:

ales
/

wav2vec2-cv-be-lm

Runtime error

wav2vec2-cv-be-lm / app.py

converting stereo audio to mono if needed

d71b5df about 3 years ago

1.69 kB

	import json

	import numpy as np

	import torch
	import torchaudio
	from torchaudio.transforms import Resample

	from huggingface_hub import hf_hub_download

	import gradio as gr

	from pipeline import PreTrainedPipeline


	# Hugging Face Hub repository id: used both as the pipeline's model path and
	# as the repository the language-model file is downloaded from.
	HF_HUB_URL = "ales/wav2vec2-cv-be"

	# Path of the language-model file inside that repository.
	LM_HUB_FP = "language_model/cv8be_5gram.bin"


	# Shared recognizer, built on the first request and reused afterwards.
	_PIPELINE = None


	def _get_pipeline():
	    """Return the shared PreTrainedPipeline, creating it on first use."""
	    global _PIPELINE
	    if _PIPELINE is None:
	        # download Language Model from HF Hub
	        lm_fp = hf_hub_download(repo_id=HF_HUB_URL, filename=LM_HUB_FP)
	        # NOTE(review): reuse assumes the pipeline keeps no per-call state
	        # that would change later results -- confirm against pipeline.py.
	        _PIPELINE = PreTrainedPipeline(model_path=HF_HUB_URL, language_model_fp=lm_fp)
	    return _PIPELINE


	def main(audio_fp: str) -> str:
	    """Recognize speech in an audio file and return the result as JSON text.

	    The audio is loaded, averaged down to a single channel when it has more
	    than one, resampled to 16 kHz and passed to the recognition pipeline as a
	    flat numpy array. A few diagnostic fields describing the input are added
	    to the pipeline result before it is serialized.

	    Args:
	        audio_fp: path of the audio file to transcribe.

	    Returns:
	        The pipeline result plus diagnostics, as an indented JSON string.

	    Raises:
	        ValueError: if ``audio_fp`` is None (nothing was recorded).
	    """
	    if audio_fp is None:
	        # Fail with a readable message instead of an opaque loader error.
	        raise ValueError('no audio file was provided')

	    audio, sampling_rate = torchaudio.load(audio_fp, normalize=True)

	    # Capture the shape as loaded, before downmixing replaces `audio`;
	    # otherwise the reported "initial" shape would always be mono.
	    init_audio_shape = tuple(audio.shape)

	    # convert stereo to mono by averaging over the first (channel) dimension
	    converted_to_mono = audio.shape[0] > 1
	    if converted_to_mono:
	        audio = torch.mean(audio, dim=0, keepdim=True)

	    # resample audio to 16kHz
	    resampler = Resample(orig_freq=sampling_rate, new_freq=16_000)
	    inputs = resampler(audio).numpy().flatten()  # numpy array, as expected by the pipeline

	    # recognize speech with the shared pipeline (built once, not per request)
	    res = _get_pipeline()(inputs=inputs)

	    # diagnostics about the input that was actually recognized
	    res['sampling_rate_orig'] = sampling_rate
	    res['init_audio_shape'] = init_audio_shape
	    res['converted_to_mono'] = converted_to_mono
	    res['inputs_shape'] = inputs.shape
	    res['inputs_max'] = np.max(inputs).item()
	    res['inputs_min'] = np.min(inputs).item()

	    # ensure_ascii=False keeps recognized non-ASCII text readable instead of
	    # emitting \uXXXX escape sequences.
	    return json.dumps(res, indent=2, ensure_ascii=False)


	# Microphone recorder whose recording is handed to `main` as a file path.
	# The label is Belarusian for "Record an audio file to recognize speech".
	# NOTE(review): the `gr.inputs` namespace and the `source=` argument belong to
	# older Gradio releases -- confirm the pinned Gradio version still has them.
	_microphone_input = gr.inputs.Audio(
	    label='Запішыце аўдыяфайл, каб распазнаваць маўленне',
	    type='filepath',
	    source='microphone',
	)

	# Web UI: one audio input, plain-text output produced by `main`.
	iface = gr.Interface(fn=main, inputs=_microphone_input, outputs='text')

	iface.launch()