Spaces:

dragonSwing
/

wav2vec2-vi-asr

Runtime error

App Files Files Community

wav2vec2-vi-asr / app.py

dragonSwing

Upload files

7bfa718 over 3 years ago

raw

history blame

3.23 kB

	import gradio as gr
	import torch
	import zipfile
	from pyctcdecode import build_ctcdecoder
	from speechbrain.pretrained import EncoderASR
	from transformers.file_utils import cached_path, hf_bucket_url

	# Fetch the zipped 4-gram language model from the model repository and unpack
	# it into the local cache directory.
	# NOTE(review): hf_bucket_url / cached_path are imported from
	# transformers.file_utils; confirm the pinned transformers version still
	# provides them.
	cache_dir = './cache/'
	lm_file = cached_path(
	    hf_bucket_url("dragonSwing/wav2vec2-base-vn-270h", filename='4gram.zip'),
	    cache_dir=cache_dir,
	)
	with zipfile.ZipFile(lm_file, 'r') as archive:
	    archive.extractall(cache_dir)

	# After extraction, point at the files the decoder needs (assumed to be the
	# names contained in the archive).
	lm_file = f'{cache_dir}lm.binary'
	vocab_file = f'{cache_dir}vocab-260000.txt'

	# Acoustic model used to turn waveforms into per-frame token scores.
	# NOTE(review): savedir is an absolute "/content/..." path — confirm it is
	# writable in the deployment environment.
	model = EncoderASR.from_hparams(
	    source="dragonSwing/wav2vec2-base-vn-270h",
	    savedir="/content/pretrained2/",
	)

	def get_decoder_ngram_model(tokenizer, ngram_lm_path, vocab_path=None):
	    """Build a CTC beam-search decoder backed by an n-gram language model.

	    Args:
	        tokenizer: Object exposing ``get_vocab()``, ``pad_token_id`` and
	            ``word_delimiter_token_id``.
	        ngram_lm_path: Path of the language-model file handed to
	            ``build_ctcdecoder``.
	        vocab_path: Optional UTF-8 text file holding one unigram per line.
	            When omitted, no unigram list is passed to the decoder.

	    Returns:
	        The decoder returned by ``build_ctcdecoder``.
	    """
	    if vocab_path is None:
	        unigrams = None
	    else:
	        with open(vocab_path, encoding='utf-8') as handle:
	            unigrams = [entry.strip() for entry in handle]

	    # Order the tokens by id; the list position is then used as the token id
	    # below (assumes ids are contiguous starting at 0 — TODO confirm).
	    id_token_pairs = [(idx, token) for token, idx in tokenizer.get_vocab().items()]
	    id_token_pairs.sort()
	    labels = [token for _, token in id_token_pairs]

	    # Represent the CTC blank (the pad token) as an empty string.
	    labels[tokenizer.pad_token_id] = ""
	    # Represent the word delimiter token as a plain space.
	    labels[tokenizer.word_delimiter_token_id] = " "

	    return build_ctcdecoder(labels, ngram_lm_path, unigrams=unigrams)

	# Build the decoder once at module load; transcribe_file() reuses this instance on every call.
	ngram_lm_model = get_decoder_ngram_model(model.tokenizer, lm_file, vocab_file)

	def transcribe_file(path, max_seconds=20, *, sample_rate=16000, beam_width=500):
	    """Transcribe one audio file with the acoustic model and n-gram decoder.

	    Args:
	        path: Audio file location passed to ``model.load_audio``.
	        max_seconds: Upper bound on the audio duration that is transcribed.
	            May be fractional; values <= 0 disable truncation.
	        sample_rate: Samples per second assumed when converting
	            ``max_seconds`` into a sample count.
	        beam_width: Beam width forwarded to the decoder.

	    Returns:
	        The decoded text for the file.
	    """
	    waveform = model.load_audio(path)
	    if max_seconds > 0:
	        # Slice bounds must be integers; without the cast a fractional
	        # max_seconds (e.g. 2.5) raises instead of truncating.
	        waveform = waveform[:int(max_seconds * sample_rate)]

	    with torch.no_grad():
	        # Batch of one item, with relative length 1.0.
	        logits = model(waveform.unsqueeze(0), torch.tensor([1.0]))

	    # The batch holds a single item, so only its scores are decoded.
	    scores = logits[0].detach().cpu().numpy()
	    return ngram_lm_model.decode(scores, beam_width=beam_width)

	def speech_recognize(file_upload, file_mic):
	    """Transcribe whichever audio input was supplied.

	    Args:
	        file_upload: Path of an uploaded audio file, or ``None``.
	        file_mic: Path of a microphone recording, or ``None``.

	    Returns:
	        The transcription of the uploaded file when present, otherwise of
	        the microphone recording; an empty string when both are ``None``.
	    """
	    if file_upload is not None:
	        return transcribe_file(file_upload)
	    if file_mic is not None:
	        return transcribe_file(file_mic)
	    return ""

	# Two optional audio sources, in the order speech_recognize() expects them:
	# an uploaded file first, then a microphone recording.
	# NOTE(review): gr.inputs / gr.outputs are legacy Gradio namespaces — confirm
	# they are available in the Gradio version this app is pinned to.
	inputs = [
	    gr.inputs.Audio(source=origin, type='filepath', optional=True)
	    for origin in ("upload", "microphone")
	]
	outputs = gr.outputs.Textbox(label="Output Text")

	title = "wav2vec2-base-vietnamese-270h"
	description = (
	    "Gradio demo for a wav2vec2 base vietnamese speech recognition. "
	    "To use it, simply upload your audio, click one of the examples to load them, "
	    "or record from your own microphone. "
	    "Read more at the links below. "
	    "Currently supports 16_000hz audio files"
	)
	article = (
	    "<p style='text-align: center'>"
	    "<a href='https://huggingface.co/dragonSwing/wav2vec2-base-vn-270h' target='_blank'>"
	    "Pretrained model</a></p>"
	)

	# Each example row fills both audio inputs with the same file.
	examples = [
	    [clip, clip]
	    for clip in ('example1.wav', 'example2.mp3', 'example3.mp3', 'example4.wav')
	]

	gr.Interface(
	    fn=speech_recognize,
	    inputs=inputs,
	    outputs=outputs,
	    title=title,
	    description=description,
	    article=article,
	    examples=examples,
	).launch()