import os

# Install Whisper from source at startup; on Hugging Face Spaces a requirements.txt
# entry would work as well.
os.system("pip install git+https://github.com/openai/whisper.git")
import gradio as gr
import whisper

# Load the multilingual "small" checkpoint; larger models ("medium", "large") are
# more accurate but slower, especially on CPU.
model = whisper.load_model("small")
def predict(audio, mic_audio=None):
    # With type="filepath", Gradio passes the path of a temporary audio file (or None).
    # Prefer the microphone recording when both inputs are given.
    if mic_audio is not None:
        input_audio = mic_audio
    elif audio is not None:
        input_audio = audio
    else:
        return "(please provide audio)"
    # load_audio decodes via ffmpeg, downmixes to mono, and resamples to 16 kHz;
    # pad_or_trim fits the waveform to Whisper's 30-second context window.
    audio = whisper.load_audio(input_audio)
    audio = whisper.pad_or_trim(audio)
    mel = whisper.log_mel_spectrogram(audio).to(model.device)
    _, probs = model.detect_language(mel)
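    # probs maps language codes to probabilities; printing the top guess mirrors the
    # example in the Whisper README (optional, added here for visibility).
    print(f"Detected language: {max(probs, key=probs.get)}")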
    # fp16=False keeps decoding in full precision, avoiding FP16 warnings on CPU-only hardware.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(model, mel, options)
    print(result.text)

    # The interface declares a single Text output, so return just the transcription.
    return result.text
title = "Demo for Whisper -> Something -> XLS-R"
description = """
<b>How to use:</b> Upload an audio file or record one with the microphone. The audio is downmixed to mono and resampled
to 16 kHz before being passed to the model, which returns the text transcription.
"""
gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Speech", source="upload", type="filepath"),
        gr.Audio(label="Record Speech", source="microphone", type="filepath"),
    ],
    outputs=[
        gr.Text(label="Transcription"),
    ],
    title=title,
    description=description,
).launch()
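# To try this locally (a sketch, assuming ffmpeg is installed and this file is saved
# as app.py, the conventional Spaces entry point):
#   pip install gradio
#   python app.py
# then open the local URL that launch() prints.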