import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch, torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
pipe_is = pipeline(model=MODEL_IS)
pipe_fo = pipeline(model=MODEL_FO)
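
# Note: the two pipelines above load their own copies of the models on CPU by
# default, separately from model_is/model_fo. A hedged sketch (not wired in):
# pipeline() accepts a `device` argument to place them on the GPU when one is
# available, e.g.
#   pipe_is = pipeline(model=MODEL_IS, device=0 if torch.cuda.is_available() else -1)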
def readwav(a_f):
    """Read an audio file as mono float32 at 16 kHz."""
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        # Downmix stereo to mono
        wav = wav.mean(1)
    if sr != 16000:
        # Resample to the 16 kHz rate the models were trained on
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
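
# A minimal alternative reader using torchaudio (imported above), shown for
# illustration only and not wired into the app; assumes torchaudio can decode
# the uploaded format.
def readwav_torchaudio(a_f, target_sr=16000):
    wav, sr = torchaudio.load(a_f)    # (channels, samples), float32
    wav = wav.mean(0)                 # downmix to mono
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)
    return wav.numpy()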
def recc(audio_file, model, processor):
    """Whole-file greedy CTC recognition with the given model and processor."""
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]
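
# The decode above is plain greedy CTC: argmax per frame, then batch_decode
# collapses repeats and strips CTC blanks. As the notes below mention, these
# checkpoints ship no language model; a hedged sketch of beam-search decoding
# with a KenLM-equipped checkpoint (this repo id is hypothetical):
#   from transformers import Wav2Vec2ProcessorWithLM
#   processor_lm = Wav2Vec2ProcessorWithLM.from_pretrained("some-org/wav2vec2-xlsr-with-lm")
#   text = processor_lm.batch_decode(logits.cpu().numpy()).text[0]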
def recis(audio_file):
    """Icelandic: whole-file recognition plus 4-second-chunked pipeline recognition."""
    single_output = recc(audio_file, model_is, processor_is)
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    return (single_output, chunk_output)

def recfo(audio_file):
    """Faroese: whole-file recognition plus 4-second-chunked pipeline recognition."""
    single_output = recc(audio_file, model_fo, processor_fo)
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    return (single_output, chunk_output)
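
# Example usage outside the UI (not executed here): both functions take a path
# to an audio file and return (whole-file, chunked) transcripts, e.g.
#   whole, chunked = recis("sample.wav")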
bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
# Speech recognition
### Users logged in to a Hugging Face account can use each model's normal hosted inference API instead.
## * * * * * * * *
Upload a file for recognition with
https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
- Wav2Vec2 models have no language model (yet), so they can generate non-words.
- Whisper can hallucinate.
- Send errors/bugs to [email protected]
"""
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    whole_output = gr.Textbox(label="Wav2Vec2 whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition with chunking")
            text_button = gr.Button("Recognise Icelandic")
            # recis returns two transcripts, one per textbox
            text_button.click(recis, inputs=audio_file, outputs=[whole_output, chunk_output])
with gr.TabItem("Faroese"):
with gr.Row():
audio_file = gr.Audio(sources=["upload", "microphone"],type="filepath")
with gr.Column():
#whole_output = gr.Textbox(label="whole-file recognition")
chunk_output = gr.Textbox(label="Wav2Vec2 recognition")# with chunking")
text_button = gr.Button("Recognise Faroese")
#text_button.click(recfo, inputs=audio_file, outputs=[whole_output,chunk_output])
text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
bl.launch()