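# Streamlit app for Swedish speech-to-text: upload a .wav file, transcribe it
# with KBLab's wav2vec 2.0 large VoxRex Swedish model combined with a 4-gram
# language model, and download the resulting transcript.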
import streamlit as st
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
import torch
import torchaudio
import torchaudio.functional as F
st.set_page_config(
page_title="Swedish Speech-to-Text",
    page_icon="🎙️"
)
st.image(
"https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/320/apple/325/studio-microphone_1f399-fe0f.png",
width=100,
)
st.markdown("""
# Swedish high-quality transcription
Generate Swedish transcripts for download from an audio file with this high-quality speech-to-text model. The model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
""")
model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
def run_inference(file):
    """Transcribe an uploaded audio file and return a lower-cased transcript."""
    waveform, sample_rate = torchaudio.load(file)

    # The model expects 16 kHz mono audio: resample if needed and keep channel 0.
    if sample_rate == 16_000:
        waveform = waveform[0]
    else:
        waveform = F.resample(waveform, sample_rate, 16_000)[0]

    inputs = processor(
        waveform,
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits

    # Decode with the 4-gram language model via CTC beam search.
    return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
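
# Streamlit UI: upload a file, transcribe it, and offer the transcript for download.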
uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
if uploaded_file is not None:
if uploaded_file.type != "audio/wav":
pass
# TODO: convert to wav
# bytes = uploaded_file.getvalue()
# audio_input = ffmpeg.input(bytes).audio
# audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
# ffmpeg.run(audio_output)
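    # For now, non-wav uploads fall through and are passed to torchaudio unchanged.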
transcript = run_inference(uploaded_file)
st.download_button("Download transcript", transcript,
f"{uploaded_file.name}-swedish-transcript.txt")
with st.expander("Transcript", expanded=True):
st.write(transcript)