Spaces:

viktor-enzell
/

wav2vec2-large-voxrex-swedish-4gram

Runtime error

App Files Community

viktor-enzell committed on Jun 4, 2022

Commit

5b95586

1 Parent(s): 36d5632

Prettier interface and ability to download transcript.

Browse files

Files changed (2) hide show

README.md +3 -3
app.py +29 -8

README.md CHANGED Viewed

@@ -1,8 +1,8 @@
 ---
 title: Wav2vec2 Large Voxrex Swedish 4gram
-emoji: 📈
-colorFrom: purple
-colorTo: green
 sdk: streamlit
 sdk_version: 1.9.0
 app_file: app.py

 ---
 title: Wav2vec2 Large Voxrex Swedish 4gram
+emoji: 🎙️
+colorFrom: orange
+colorTo: black
 sdk: streamlit
 sdk_version: 1.9.0
 app_file: app.py

app.py CHANGED Viewed

@@ -1,18 +1,27 @@
 import streamlit as st
 from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
 import torch
 import torchaudio
 import torchaudio.functional as F
 st.set_page_config(
-    page_title='Swedish Speech-to-Text',
-    page_icon='🎙️'
 )
-# Import model and processor
-model_name = 'viktor-enzell/wav2vec2-large-voxrex-swedish-4gram'
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
 processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
@@ -28,7 +37,7 @@ def run_inference(file):
     inputs = processor(
         waveform,
         sampling_rate=16_000,
-        return_tensors='pt',
         padding=True
     ).to(device)
@@ -38,8 +47,20 @@ def run_inference(file):
     return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
-uploaded_file = st.file_uploader('Choose a file', type=['.wav'])
 if uploaded_file is not None:
     transcript = run_inference(uploaded_file)
-    st.write(transcript)

 import streamlit as st
 from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
+import ffmpeg
 import torch
 import torchaudio
 import torchaudio.functional as F
 st.set_page_config(
+    page_title="Swedish Speech-to-Text",
+    page_icon="🎙️"
+)
+st.image(
+    "https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/320/apple/325/studio-microphone_1f399-fe0f.png",
+    width=100,
 )
+st.markdown("""
+# Swedish high-quality transcription
+Generate Swedish transcripts for download from an audio file with this high-quality speech-to-text model. The model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
+""")
+model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
 processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
     inputs = processor(
         waveform,
         sampling_rate=16_000,
+        return_tensors="pt",
         padding=True
     ).to(device)
     return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
+uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
 if uploaded_file is not None:
+    if uploaded_file.type != "audio/wav":
+        pass
+        # TODO: convert to wav
+        # bytes = uploaded_file.getvalue()
+        # audio_input = ffmpeg.input(bytes).audio
+        # audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
+        # ffmpeg.run(audio_output)
     transcript = run_inference(uploaded_file)
+    st.download_button("Download transcript", transcript,
+                       f"{uploaded_file.name}-swedish-transcript.txt")
+    with st.expander("Transcript", expanded=True):
+        st.write(transcript)