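# Streamlit app for Swedish speech-to-text: upload a .wav file, transcribe it
# with KBLab's wav2vec 2.0 large VoxRex Swedish model combined with a 4-gram
# language model, and download the resulting transcript.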
import streamlit as st
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
import torch
import torchaudio
import torchaudio.functional as F
st.set_page_config(
page_title="Swedish Speech-to-Text",
    page_icon="🎙️"
)
st.image(
"https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/320/apple/325/studio-microphone_1f399-fe0f.png",
width=100,
)
st.markdown("""
# Swedish high-quality transcription
Generate Swedish transcripts for download from an audio file with this high-quality speech-to-text model. The model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
""")
model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
def run_inference(file):
    """Transcribe an uploaded audio file and return a lower-cased transcript."""
    waveform, sample_rate = torchaudio.load(file)

    # The model expects 16 kHz mono audio: resample if needed and keep channel 0.
    if sample_rate == 16_000:
        waveform = waveform[0]
    else:
        waveform = F.resample(waveform, sample_rate, 16_000)[0]

    inputs = processor(
        waveform,
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode with the 4-gram language model via CTC beam search.
    return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
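
# Streamlit UI: upload a file, transcribe it, and offer the transcript for download.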
uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
if uploaded_file is not None:
if uploaded_file.type != "audio/wav":
pass
# TODO: convert to wav
# bytes = uploaded_file.getvalue()
# audio_input = ffmpeg.input(bytes).audio
# audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
# ffmpeg.run(audio_output)
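    # For now, non-wav uploads fall through and are passed to torchaudio unchanged.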
transcript = run_inference(uploaded_file)
st.download_button("Download transcript", transcript,
f"{uploaded_file.name}-swedish-transcript.txt")
with st.expander("Transcript", expanded=True):
st.write(transcript)