Spaces:
Runtime error
Runtime error
File size: 4,755 Bytes
fd585a0 91a1a24 fd585a0 91a1a24 0f1ce4a 91a1a24 fd585a0 91a1a24 fd585a0 91a1a24 fd585a0 0f1ce4a 91a1a24 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 |
import os
import gradio as gr
import whisper
import time
# Load the Whisper "base" checkpoint once at import time; transcribe() below
# reuses this module-level model for every request instead of reloading it.
model = whisper.load_model("base")
def transcribe(audio, state=None, delay=0.2, lang=None, translate=False):
    """Transcribe one audio chunk and append the result to the running state.

    Args:
        audio: Audio input handed to ``model.transcribe`` and, when
            ``translate`` is set, to ``whisper.load_audio`` (the UI passes a
            file path).
        state: Dict carrying the accumulated ``"transcription"`` and
            ``"translation"`` strings between calls. A fresh dict is created
            when omitted, and missing keys are initialised to ``""``.
        delay: Seconds to sleep before transcribing.
        lang: Language name passed to Whisper; ``"auto"`` or ``None`` lets the
            model detect the language.
        translate: When true, also decode an English translation of the chunk.

    Returns:
        A 4-tuple ``(transcription_text, translation_text, state,
        detected_language_message)``.
    """
    # Never use a mutable default: a shared ``{}`` would leak text between
    # sessions, and an empty dict has neither key the ``+=`` below relies on.
    if state is None:
        state = {}
    state.setdefault("transcription", "")
    state.setdefault("translation", "")

    time.sleep(delay)

    transcription = model.transcribe(
        audio,
        language=lang if lang != "auto" else None,
    )
    state["transcription"] += transcription["text"] + " "

    if translate:
        x = whisper.load_audio(audio)
        x = whisper.pad_or_trim(x)
        mel = whisper.log_mel_spectrogram(x).to(model.device)
        # Whisper's documented task values are "transcribe" and "translate".
        # NOTE(review): all other DecodingOptions are left at their defaults;
        # confirm those defaults (e.g. fp16) suit the device the model runs on.
        options = whisper.DecodingOptions(task="translate")
        translation = whisper.decode(model, mel, options)
        state["translation"] += translation.text + " "

    return (
        state["transcription"],
        state["translation"],
        state,
        f"detected language: {transcription['language']}",
    )
# ---------------------------------------------------------------------------
# UI definition: builds the Gradio components, wires them to transcribe(),
# and launches the app. Input order below must match transcribe()'s
# positional parameters (audio, state, delay, lang, translate).
# ---------------------------------------------------------------------------
title = "OpenAI's Whisper Real-time Demo"
description = "A simple demo of OpenAI's [**Whisper**](https://github.com/openai/whisper) speech recognition model."

# Choices offered in the language dropdown; "auto" is mapped to ``None``
# (language detection) inside transcribe().
# NOTE(review): confirm every entry is a language name Whisper accepts; an
# unsupported name would presumably be rejected when passed to the model.
language_choices = [
    "auto", "english", "afrikaans",
    "albanian", "amharic", "arabic",
    "armenian", "assamese", "azerbaijani",
    "bashkir", "basque", "belarusian",
    "bengali", "bosnian", "breton",
    "bulgarian", "catalan", "chinese",
    "croatian", "czech", "danish",
    "dutch", "estonian", "faroese",
    "finnish", "french", "galician",
    "georgian", "german", "greek",
    "gujarati", "haitian creole", "hausa",
    "hawaiian", "hebrew", "hindi",
    "hungarian", "icelandic", "indonesian",
    "italian", "japanese", "javanese",
    "kannada", "kazakh", "khmer",
    "korean", "kyrgyz", "lao",
    "latin", "latvian", "lingala",
    "lithuanian", "luxembourgish", "macedonian",
    "malagasy", "malay", "malayalam",
    "maltese", "maori", "marathi",
    "mongolian", "myanmar", "nepali",
    "norwegian", "nyanja", "nynorsk",
    "occitan", "oriya", "pashto",
    "persian", "polish", "portuguese",
    "punjabi", "romanian", "russian",
    "sanskrit", "sardinian", "serbian",
    "shona", "sindhi", "sinhala",
    "slovak", "slovenian", "somali",
    "spanish", "sundanese", "swahili",
    "swedish", "tagalog", "tajik",
    "tamil", "tatar", "telugu",
    "thai", "tigrinya", "tibetan",
    "turkish", "turkmen", "ukrainian",
    "urdu", "uzbek", "vietnamese",
    "welsh", "xhosa", "yiddish",
    "yoruba",
]

# Use the top-level component classes throughout (the file already does so
# for Textbox/Audio/State) instead of the deprecated ``gr.inputs`` /
# ``gr.outputs`` namespaces; initial values are given via ``value=``.
delay_slider = gr.Slider(
    minimum=0,
    maximum=5,
    value=0.2,
    label="Rate of transcription (1 sec + this value)",
)
lang_dropdown = gr.Dropdown(
    choices=language_choices,
    label="Language",
    value="auto",
    type="value",
)
translate_checkbox = gr.Checkbox(label="Translate to English", value=False)

transcription_tb = gr.Textbox(label="Transcription", lines=10, max_lines=20)
translation_tb = gr.Textbox(label="Translation", lines=10, max_lines=20)
detected_lang = gr.HTML(label="Detected Language")

# Session state holding the text accumulated so far; it is both an input and
# an output so each call receives what the previous call returned.
state = gr.State({"transcription": "", "translation": ""})

demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        state,
        delay_slider,
        lang_dropdown,
        translate_checkbox,
    ],
    outputs=[
        transcription_tb,
        translation_tb,
        state,
        detected_lang,
    ],
    live=True,
    allow_flagging='never',
    title=title,
    description=description,
)
demo.launch()