import os

import torch
import torchaudio
import gradio as gr
from pydub import AudioSegment
import pyaudioconvert as pac
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, Wav2Vec2ProcessorWithLM

def convert(audio):
    # Normalize a supported upload (mp3/wav/ogg) to 16-bit mono WAV in place.
    # Currently unused: transcribe() resamples with torchaudio instead.
    file_name = audio
    if file_name.endswith("mp3") or file_name.endswith("wav") or file_name.endswith("ogg"):
        if file_name.endswith("mp3"):
            sound = AudioSegment.from_mp3(file_name)
            sound.export(audio, format="wav")
        elif file_name.endswith("ogg"):
            sound = AudioSegment.from_ogg(audio)
            sound.export(audio, format="wav")
    else:
        return False
    pac.convert_wav_to_16bit_mono(audio, audio)
    return True

def parse_transcription_with_lm(logits):
    # Beam-search decode with the LM-boosted processor, then strip the <s> token.
    result = processor_with_LM.batch_decode(logits.cpu().numpy())
    text = result.text
    transcription = text[0].replace("<s>", "")
    return transcription

def parse_transcription(logits):
    # Greedy CTC decoding: pick the highest-scoring token at each frame.
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.decode(predicted_ids[0], skip_special_tokens=True)
    return transcription

def transcribe(audio, audio_microphone, applyLM):
    # Prefer the microphone recording when both inputs are given.
    audio_path = audio_microphone if audio_microphone else audio
    speech_array, sampling_rate = torchaudio.load(audio_path)
    # Resample to the 16 kHz rate the wav2vec2 model expects.
    speech = torchaudio.functional.resample(speech_array, orig_freq=sampling_rate, new_freq=16000).squeeze().numpy()
    # Earlier pydub/pyaudioconvert conversion path, kept for reference:
    # if convert(audio_path) == False:
    #     return "The format must be mp3, wav, or ogg"
    # speech, sample_rate = torchaudio.load(audio_path)
    inputs = processor(speech, sampling_rate=16_000, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(inputs.input_values).logits
    if applyLM:
        return parse_transcription_with_lm(logits)
    else:
        return parse_transcription(logits)

# Use the token from the "key" secret if set; True falls back to the cached Hub login.
auth_token = os.environ.get("key") or True
model_id = "mutisya/wav2vec2-300m-kik-t22-1k-ft-withLM"
processor = Wav2Vec2Processor.from_pretrained(model_id, use_auth_token=auth_token)
processor_with_LM = Wav2Vec2ProcessorWithLM.from_pretrained(model_id, use_auth_token=auth_token)
model = Wav2Vec2ForCTC.from_pretrained(model_id, use_auth_token=auth_token)

# Note: this targets the Gradio 3.x API (source=, optional=, enable_queue=).
gradio_ui = gr.Interface(
    fn=transcribe,
    title="Kikuyu Speech Recognition",
    description="",
    inputs=[
        gr.Audio(label="Upload Audio File", type="filepath", optional=True),
        gr.Audio(source="microphone", type="filepath", optional=True, label="Record from microphone"),
        gr.Checkbox(label="Apply LM", value=False),
    ],
    outputs=[gr.Textbox(label="Recognized speech")],
)
gradio_ui.launch(enable_queue=True)
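
# A minimal local smoke test, bypassing the UI (hypothetical clip path; run it
# before launch() or in a separate session, since launch() blocks):
#
#   print(transcribe("sample_kikuyu.wav", None, applyLM=False))  # greedy decoding
#   print(transcribe("sample_kikuyu.wav", None, applyLM=True))   # LM beam search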