# imports
import os
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper
# The model we are using for ASR. Options are small, medium, large, and large-v2
# (large and large-v2 don't fit on the Hugging Face CPU tier).
model = whisper.load_model("small")
# A lookup table from the dropdown language names to Whisper language codes
language_id_lookup = {
    "Arabic":  "ar",
    "English": "en",
    "Chinese": "zh",
    "German":  "de",
    "Spanish": "es",
    "Russian": "ru",
    "French":  "fr",
}
# The predict function. audio, language, and mic_audio are all passed in directly by Gradio,
# i.e. they are user inputs. They correspond to the components in the inputs=[] list of the
# gr.Interface call at the bottom; the outputs=[] list specifies the output types.
def predict(audio, language, mic_audio=None):
    # Prefer the microphone recording if one was made; otherwise fall back to the uploaded file
    if mic_audio is not None:
        input_audio = mic_audio
    elif audio is not None:
        input_audio = audio
    else:
        return "(please provide audio)", ""
    # Use Whisper's own preprocessing helpers to load, resample, and pad/trim the audio
    audio = whisper.load_audio(input_audio)
    audio = whisper.pad_or_trim(audio)

    # Compute the log-Mel spectrogram and move it to the same device as the model
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    # If the user asked for language detection, leave the language as None so Whisper
    # detects it; otherwise look up the code for the language they selected.
    if language == "Detect Language":
        outLanguage = None
    else:
        outLanguage = language_id_lookup[language.split()[0]]

    # Decode. fp16=False because the Space runs on CPU; with language=None, Whisper detects
    # the language itself, and result.language holds whichever language was used.
    options = whisper.DecodingOptions(fp16=False, language=outLanguage)
    result = whisper.decode(model, mel, options)
    outLanguage = result.language

    print(result.text + " " + outLanguage)
    return result.text, outLanguage
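
# A minimal sketch (not called anywhere in this app) of how the "Detect Language" path works
# on its own: Whisper's model.detect_language returns per-language probabilities, and the most
# likely code is what decode() settles on when no language is given. The helper name below is
# just for illustration.
def detect_language_only(path):
    audio = whisper.pad_or_trim(whisper.load_audio(path))
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
    # probs maps language codes to probabilities, e.g. {"en": 0.93, "de": 0.02, ...}
    return max(probs, key=probs.get)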
title = "Demo for Whisper -> Something -> XLS-R"
description = """
<b>How to use:</b> Upload an audio file or record using the microphone. The audio is fed into the Whisper model developed by OpenAI.
The output is the text transcription of the audio in the language you selected. If you ask the model to detect the language, it will
also tell you which language it detected.
"""
gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Speech", source="upload", type="filepath"),
        gr.Dropdown(choices=['Arabic Text',
                             'Chinese Text',
                             'English Text',
                             'German Text',
                             'Spanish Text',
                             'Russian Text',
                             'French Text',
                             'Detect Language'],
                    value='English Text',
                    label="Select the language that you are speaking in."),
        gr.Audio(label="Record Speech", source="microphone", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Transcription"),
        gr.Text(label="Detected Language"),
    ],
    title=title,
    description=description,
).launch()
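
# A rough way to sanity-check predict() without launching the UI (left commented out so it
# doesn't run in the Space; "sample.wav" is only a placeholder for a local audio file):
#
#     text, detected = predict("sample.wav", "Detect Language")
#     print(detected, text)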