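# Gradio demo: near-real-time transcription (and optional English translation)
# of streaming microphone audio with OpenAI's Whisper.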
import os
import gradio as gr
import whisper
from whisper import tokenizer
import time
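# Keep the loaded model and its size at module level so transcribe() can
# reload the model only when the user picks a different size.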
current_size = 'base'
model = whisper.load_model(current_size)
AUTO_DETECT_LANG = "Auto Detect"
def transcribe(audio, state=None, model_size='base', delay=1.2, lang=None, translate=False):
    global model, current_size
    if state is None:
        state = {"transcription": "", "translation": ""}
    # Sleep for the remainder of the chosen interval; the original code
    # assumes roughly 1 s between streamed chunks.
    time.sleep(delay - 1)
    # Swap models only when the requested size differs from the cached one.
    if model_size != current_size:
        model = whisper.load_model(model_size)
        current_size = model_size
    transcription = model.transcribe(
        audio,
        language=lang if lang != AUTO_DETECT_LANG else None
    )
    state['transcription'] += transcription['text'] + " "
    if translate:
        # Translation requires a separate decoding pass over the mel spectrogram.
        x = whisper.load_audio(audio)
        x = whisper.pad_or_trim(x)
        mel = whisper.log_mel_spectrogram(x).to(model.device)
        options = whisper.DecodingOptions(task="translate")  # "translation" is not a valid Whisper task
        translation = whisper.decode(model, mel, options)
        state['translation'] += translation.text + " "
    return state['transcription'], state['translation'], state, f"Detected language: {transcription['language']}"
title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model."
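# UI components: model size, update interval, language, and translation toggle.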
model_size = gr.Dropdown(label="Model size", choices=['base', 'tiny', 'small', 'medium', 'large'], value='base')
delay_slider = gr.Slider(minimum=1, maximum=5, value=1.2, label="Update interval (s)")
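# Whisper's tokenizer exposes a language-name-to-code mapping for every
# supported language; list them alphabetically behind an "Auto Detect" option.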
available_languages = sorted(tokenizer.TO_LANGUAGE_CODE.keys())
available_languages = [lang.capitalize() for lang in available_languages]
available_languages = [AUTO_DETECT_LANG]+available_languages
lang_dropdown = gr.Dropdown(choices=available_languages, label="Language", value=AUTO_DETECT_LANG)
translate_checkbox = gr.Checkbox(label="Translate to English", value=False)
transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")
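# Session state accumulates transcription/translation text across streamed chunks.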
state = gr.State({"transcription": "", "translation": ""})
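# live=True re-runs transcribe() on each new chunk from the streaming microphone.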
gr.Interface(
fn=transcribe,
inputs=[
gr.Audio(source="microphone", type="filepath", streaming=True),
state,
model_size,
delay_slider,
lang_dropdown,
translate_checkbox
],
outputs=[
transcription_tb,
translation_tb,
state,
detected_lang
],
live=True,
allow_flagging='never',
title=title,
description=description,
).launch(
# enable_queue=True,
# debug=True
)