Spaces:
Runtime error
Runtime error
Commit
·
5b95586
1
Parent(s):
36d5632
Prettier interface and ability to download transcript.
Browse files
README.md
CHANGED
@@ -1,8 +1,8 @@
|
|
1 |
---
|
2 |
title: Wav2vec2 Large Voxrex Swedish 4gram
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.9.0
|
8 |
app_file: app.py
|
|
|
1 |
---
|
2 |
title: Wav2vec2 Large Voxrex Swedish 4gram
|
3 |
+
emoji: 🎙️
|
4 |
+
colorFrom: orange
|
5 |
+
colorTo: black
|
6 |
sdk: streamlit
|
7 |
sdk_version: 1.9.0
|
8 |
app_file: app.py
|
app.py
CHANGED
@@ -1,18 +1,27 @@
|
|
1 |
import streamlit as st
|
2 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
|
|
|
3 |
import torch
|
4 |
import torchaudio
|
5 |
import torchaudio.functional as F
|
6 |
|
|
|
7 |
st.set_page_config(
|
8 |
-
page_title=
|
9 |
-
page_icon=
|
|
|
|
|
|
|
|
|
10 |
)
|
|
|
|
|
11 |
|
|
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
16 |
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
|
17 |
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
|
18 |
|
@@ -28,7 +37,7 @@ def run_inference(file):
|
|
28 |
inputs = processor(
|
29 |
waveform,
|
30 |
sampling_rate=16_000,
|
31 |
-
return_tensors=
|
32 |
padding=True
|
33 |
).to(device)
|
34 |
|
@@ -38,8 +47,20 @@ def run_inference(file):
|
|
38 |
return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
|
39 |
|
40 |
|
41 |
-
uploaded_file = st.file_uploader(
|
42 |
if uploaded_file is not None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
43 |
transcript = run_inference(uploaded_file)
|
44 |
|
45 |
-
st.
|
|
|
|
|
|
|
|
|
|
1 |
import streamlit as st
|
2 |
from transformers import Wav2Vec2ForCTC, Wav2Vec2ProcessorWithLM
|
3 |
+
import ffmpeg
|
4 |
import torch
|
5 |
import torchaudio
|
6 |
import torchaudio.functional as F
|
7 |
|
8 |
+
|
9 |
st.set_page_config(
|
10 |
+
page_title="Swedish Speech-to-Text",
|
11 |
+
page_icon="🎙️"
|
12 |
+
)
|
13 |
+
st.image(
|
14 |
+
"https://emojipedia-us.s3.dualstack.us-west-1.amazonaws.com/thumbs/320/apple/325/studio-microphone_1f399-fe0f.png",
|
15 |
+
width=100,
|
16 |
)
|
17 |
+
st.markdown("""
|
18 |
+
# Swedish high-quality transcription
|
19 |
|
20 |
+
Generate Swedish transcripts for download from an audio file with this high-quality speech-to-text model. The model is KBLab's wav2vec 2.0 large VoxRex Swedish (C) with a 4-gram language model, which you can access [here](https://huggingface.co/viktor-enzell/wav2vec2-large-voxrex-swedish-4gram).
|
21 |
+
""")
|
22 |
|
23 |
+
model_name = "viktor-enzell/wav2vec2-large-voxrex-swedish-4gram"
|
24 |
+
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
25 |
model = Wav2Vec2ForCTC.from_pretrained(model_name).to(device)
|
26 |
processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
|
27 |
|
|
|
37 |
inputs = processor(
|
38 |
waveform,
|
39 |
sampling_rate=16_000,
|
40 |
+
return_tensors="pt",
|
41 |
padding=True
|
42 |
).to(device)
|
43 |
|
|
|
47 |
return processor.batch_decode(logits.cpu().numpy()).text[0].lower()
|
48 |
|
49 |
|
50 |
+
uploaded_file = st.file_uploader("Choose a file", type=[".wav"])
|
51 |
if uploaded_file is not None:
|
52 |
+
if uploaded_file.type != "audio/wav":
|
53 |
+
pass
|
54 |
+
# TODO: convert to wav
|
55 |
+
# bytes = uploaded_file.getvalue()
|
56 |
+
# audio_input = ffmpeg.input(bytes).audio
|
57 |
+
# audio_output = ffmpeg.output(audio_input, "tmp.wav", format="wav")
|
58 |
+
# ffmpeg.run(audio_output)
|
59 |
+
|
60 |
transcript = run_inference(uploaded_file)
|
61 |
|
62 |
+
st.download_button("Download transcript", transcript,
|
63 |
+
f"{uploaded_file.name}-swedish-transcript.txt")
|
64 |
+
|
65 |
+
with st.expander("Transcript", expanded=True):
|
66 |
+
st.write(transcript)
|