import gradio as gr
import soundfile as sf
from scipy import signal
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, pipeline
MODEL_IS="language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h"
MODEL_FO="carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h"
torch.random.manual_seed(0)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_is = Wav2Vec2ForCTC.from_pretrained(MODEL_IS).to(device)
processor_is = Wav2Vec2Processor.from_pretrained(MODEL_IS)
model_fo = Wav2Vec2ForCTC.from_pretrained(MODEL_FO).to(device)
processor_fo = Wav2Vec2Processor.from_pretrained(MODEL_FO)
# Pass the same device so the pipelines also run on GPU when available
pipe_is = pipeline(model=MODEL_IS, device=device)
pipe_fo = pipeline(model=MODEL_FO, device=device)
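# The transformers pipelines handle long-form audio: with chunk_length_s
# set (see the handlers below), the audio is split into chunks, decoded
# separately, and the partial transcripts are merged back into one string.
# A minimal usage sketch, assuming a hypothetical local file "sample.wav":
#   text = pipe_is("sample.wav", chunk_length_s=4)["text"]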
def readwav(a_f):
    # Load audio as float32, downmix stereo to mono, and resample to
    # 16 kHz, the rate both wav2vec2 models expect.
    wav, sr = sf.read(a_f, dtype=np.float32)
    if len(wav.shape) == 2:
        wav = wav.mean(1)
    if sr != 16000:
        wlen = int(wav.shape[0] / sr * 16000)
        wav = signal.resample(wav, wlen)
    return wav
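# Usage sketch for readwav, assuming a hypothetical stereo 44.1 kHz clip:
#   wav = readwav("clip.wav")  # -> mono float32 numpy array at 16 kHz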
def recc(audio_file, model, processor):
    # Whole-file recognition: one forward pass, greedy CTC decoding.
    wav = readwav(audio_file)
    with torch.inference_mode():
        input_values = processor(wav, sampling_rate=16000).input_values[0]
        input_values = torch.tensor(input_values, device=device).unsqueeze(0)
        logits = model(input_values).logits
        pred_ids = torch.argmax(logits, dim=-1)
        xcp = processor.batch_decode(pred_ids)
    return xcp[0]
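# recc decodes with a plain argmax over the CTC logits (no language
# model), which is why the UI text below warns that non-words can appear.
# Example call (hypothetical path):
#   text = recc("clip.wav", model_is, processor_is)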
def recis(audio_file):
    # Whole-file recognition is disabled along with its textbox below, so
    # return a single value to match the click() outputs list.
    #single_output = recc(audio_file, model_is, processor_is)
    chunk_output = pipe_is(audio_file, chunk_length_s=4)['text']
    return chunk_output
def recfo(audio_file):
    #single_output = recc(audio_file, model_fo, processor_fo)
    chunk_output = pipe_fo(audio_file, chunk_length_s=4)['text']
    return chunk_output
bl = gr.Blocks()
with bl:
    gr.Markdown(
        """
# Speech recognition
### Users logged in to a Hugging Face account can use each model's hosted inference API instead.
## * * * * * * * *
Upload a file for recognition with
https://huggingface.co/language-and-voice-lab/wav2vec2-large-xlsr-53-icelandic-ep30-967h
or https://huggingface.co/carlosdanielhernandezmena/wav2vec2-large-xlsr-53-faroese-100h
- Wav2Vec2 models have no language model (yet), so they can generate non-words.
- Whisper can hallucinate.
- Send errors/bugs to [email protected]
"""
    )
    with gr.Tabs():
        with gr.TabItem("Icelandic"):
            with gr.Row():
                audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
                    whisper_output = gr.Textbox(label="Whisper recognition")  # placeholder; no Whisper model is loaded yet
                    text_button = gr.Button("Recognise Icelandic")
            #text_button.click(recis, inputs=audio_file, outputs=[whole_output, chunk_output])
            text_button.click(recis, inputs=audio_file, outputs=[chunk_output])
        with gr.TabItem("Faroese"):
            with gr.Row():
                audio_file = gr.Audio(sources=["upload", "microphone"], type="filepath")
                with gr.Column():
                    #whole_output = gr.Textbox(label="whole-file recognition")
                    chunk_output = gr.Textbox(label="Wav2Vec2 recognition")  # with chunking
                    text_button = gr.Button("Recognise Faroese")
            #text_button.click(recfo, inputs=audio_file, outputs=[whole_output, chunk_output])
            text_button.click(recfo, inputs=audio_file, outputs=[chunk_output])
bl.launch()
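# When running outside Hugging Face Spaces, bl.launch(share=True) is a
# standard Gradio option to expose a temporary public URL if needed.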