# framajawa's picture
# Duplicate from dmatekenya/Chichewa-Automatic-Transcription
# 3564020
from transformers import pipeline
from transformers import WhisperForConditionalGeneration, WhisperProcessor, WhisperFeatureExtractor
import gradio as gr
import librosa
# --- Model configuration ----------------------------------------------------
# Checkpoint the feature extractor and the generation model are loaded from.
MODEL_SPECS_ID = "dmatekenya/whisper-small_finetuned_sh_chich"
# Checkpoint the processor is loaded from.
MODEL_SPECS_BASE_ID = "openai/whisper-small"
# Language labels; only the Shona one is passed to the processor below.
MODEL_SPECS_BASE_LAN_SW = "swahili"
MODEL_SPECS_BASE_LAN_SH = "shona"

# --- Objects loaded once, when the module is first executed -----------------
FEATURE_EXTRACTOR = WhisperFeatureExtractor.from_pretrained(MODEL_SPECS_ID)
PROCESSOR_SH = WhisperProcessor.from_pretrained(
    MODEL_SPECS_BASE_ID,
    language=MODEL_SPECS_BASE_LAN_SH,
    task="transcribe",
)
MODEL = WhisperForConditionalGeneration.from_pretrained(MODEL_SPECS_ID)
def transcribe(audio_file):
    """Transcribe one audio file with the module-level processor and model.

    Args:
        audio_file: Audio input handed straight to ``librosa.load``.

    Returns:
        The first entry of the batch-decoded generation output, with
        special tokens skipped.
    """
    # Load the audio, asking librosa for a 16 kHz sampling rate.
    waveform, rate = librosa.load(audio_file, sr=16000)

    encoded = PROCESSOR_SH(waveform, return_tensors="pt", sampling_rate=rate)
    token_ids = MODEL.generate(inputs=encoded.input_features)

    decoded = PROCESSOR_SH.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0]
def transcribe_audio(mic=None, file=None):
    """Transcribe whichever audio input was supplied.

    Args:
        mic: Microphone recording, or ``None`` when nothing was recorded.
        file: Uploaded audio file, or ``None`` when nothing was uploaded.

    Returns:
        The transcription of the chosen input, or a fixed help message when
        both inputs are ``None``.
    """
    # Scan in priority order: the microphone recording wins over the upload.
    for candidate in (mic, file):
        if candidate is not None:
            return transcribe(audio_file=candidate)
    return "You must either provide a mic recording or a file"
# Texts shown by the web interface: page title, description, and footer article.
title = "Transcribe Chichewa Audio"

# Spelled out line by line; the value starts with a newline and ends with one.
description = (
    "\n"
    '<img src="https://i.ibb.co/5nQdGSs/logo.png">\n'
    "IN THIS DEMO, TEST THE FIRST AUTOMATED SPEECH RECOGNITION (ASR) MODEL FOR CHICHEWA BY TRANSCRIBING YOUR CHICHEWA VOICE NOTES.\n"
    "FOR AUDIO FILES, PLEASE UPLOAD SHORT VOICE NOTES ONLY (NO LONGER THAN 30 SEC).\n"
)

# Adjacent literals are joined into a single-line string with no line breaks.
article = (
    "Read more about the [ChichewaSpeech2Text](https://dmatekenya.github.io/Chichewa-Speech2Text/README.html) project "
    "and make sure to sign-up for our first [voice note donation event](https://forms.gle/fHLESutofVvb2YFM9) on July 22. "
    "You stand a chance to win Airtel or TNM units if you choose to participate in the raffle after the event"
)
# Two optional audio inputs; their order matches transcribe_audio(mic, file).
mic_input = gr.Audio(source="microphone", type="filepath", optional=True)
upload_input = gr.Audio(source="upload", type="filepath", optional=True)

# Build the interface around transcribe_audio, then start serving it.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=[mic_input, upload_input],
    outputs="text",
    title=title,
    description=description,
    article=article,
    theme='grass',
)
demo.launch()